Ejemplo n.º 1
0
    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (100000, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (100000, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (100000, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100,00)),

            (100000, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) 
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i!=0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                    h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i]))
                    if not result:
                        raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold))
                    h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, 
                        msg='exec percentile: %s too different from expected: %s' % (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1==0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    else:
                        # does this way work (column getting)j
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2') 
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols,1)
                    self.assertEqual(numRows,len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                    )

            h2o.nodes[0].remove_all_keys()
Ejemplo n.º 2
0
def simpleCheckKMeans(self, modelResult, parameters, numRows, numCols, labels):
    # Sanity-check a KMeans model result against the request parameters and
    # known dataset dimensions, then return the per-cluster results sorted
    # for easy comparison across runs.
    #
    # modelResult: json dict from the KMeans model build (reads ['models'][0]['output'])
    # parameters:  dict of build parameters; 'K' and 'max_iters' are checked if present
    # numRows:     expected total rows in the dataset (or falsy to skip the check)
    # numCols:     expected column count AFTER ignored columns are removed (or falsy)
    # labels:      expected column names after ignored-column removal (or falsy)
    # returns:     (tuplesSorted, iters, mse, names) where tuplesSorted is a list of
    #              (clusterId, mse, rows, center) tuples
    #
    # labels should have the ignored columns removed
    # numCols should be decremented by the ignored columns
    # the names order should then match the labels order

    output = modelResult['models'][0]['output']
    # print "model output:", dump_json(output)
    # find out what results we get
    ko = KMeansOutput(output)
    if 1==0:
        # debug aid: dump every attribute of the output object as python code
        for attr, value in ko.__dict__.iteritems():
            # create some python prints to use
            print "%s = ko.%s # %s" % (attr, attr, value)

    # these should sum to the rows in the dataset
    rows = ko.rows # [78, 5, 41, 76]
    model_category = ko.model_category # Clustering
    iters = ko.iters # 11.0
    schema_version = ko.schema_version # 2
    domains = ko.domains # [None, None, None, None, None, None, None, None, None, None, None, None, None, None]
    # 
    names = ko.names # [u'STR', u'OBS', u'AGMT', u'FNDX', u'HIGD', u'DEG', u'CHK', u'AGP1', u'AGMN', u'NLV', u'LIV', u'WT', u'AGLP', u'MST']
    schema_name = ko.schema_name # KMeansModelOutputV2
    schema_type = ko.schema_type # KMeansOutput
    ncats = ko.ncats # 0
    clusters = ko.clusters # [ 4 lists of centers ]
    mse = ko.mse # 505.632581773
    mses = ko.mses # [476.37866653867707, 563.7343365736649, 606.3007046232348, 477.5260498976912]

    if numRows:
        assert numRows==sum(rows)

    if 'K' in parameters:
        # one mse, one center list and one row count per requested cluster
        K = parameters['K']
        assert len(mses) == K
        assert len(clusters) == K
        assert len(rows) == K

    if numCols:
        assert len(names) == numCols, \
            "Need to pass correct numCols after ignored columns decrement %s %s" % (len(names), numCols)
        for c in clusters:
            assert len(c) == numCols, "%s %s" % (len(c), numCols)

    # this should be true 
    if labels:
        assert len(labels) == numCols, \
            "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols)
        assert len(labels) == len(names), \
            "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names))
        assert labels == names

    if 'max_iters' in parameters:
        max_iters = parameters['max_iters']
        assert max_iters >= iters

    # we could check the centers are within the min/max of each column
    for i,c in enumerate(clusters):
        for n in c:
            if math.isnan(float(n)):
                raise Exception("cluster", i, "has NaN:", n, "center:", c)

    # shouldn't have any errors
    check_sandbox_for_errors()

    # create a tuple for each cluster result, then sort by rows for easy comparison
    # maybe should sort by centers?
    # put a cluster index in there too, (leftmost) so we don't lose teack
    tuples = zip(range(len(clusters)), mses, rows, clusters)
    # NOTE(review): itemgetter(3) sorts by the cluster centers (index 3), not by
    # rows (index 2) as the comment above says — confirm which was intended.
    tuplesSorted = sorted(tuples, key=itemgetter(3))

    # undo for printing what the caller will see
    ids, mses, rows, clusters = zip(*tuplesSorted)

    print "\nmse:", mse
    print "iters:", iters
    print "ids:", ids
    print "mses:", mses
    print "rows:", rows
    for i,c in enumerate(clusters):
        print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c)

    
    # to unzip the tuplesSorted. zip with *
    # ids, mses, rows, clusters = zip(*tuplesSorted)
    return tuplesSorted, iters, mse, names
Ejemplo n.º 3
0
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (ROWS, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (ROWS, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (ROWS, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100,00)),

            (ROWS, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]
            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt)
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull, 
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10], 
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    )

            h2o.nodes[0].remove_all_keys()
Ejemplo n.º 4
0
    def test_summary2_int2B(self):
        # Exercise Summary2 on a synthetic dataset of large (>2^31) integer
        # values, to check percentile/histogram handling near the 2B range.
        # Expected percentile values are all None in tryList, so only the
        # summary plumbing (and optional checks guarded by expected[1]) runs.
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # each entry: (rowCount, colCount, hex_key, expectedMin, expectedMax,
        #              (colname, min, 25th, 50th, 75th, max))
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (100000, 1, 'B.hex', 2533255332, 2633256000,   ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/(MAX_QBINS + 0.0)) 
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * .01
            maxDelta = maxDelta + maxShift

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            if expected[0]:
                self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            # expected percentiles are all None for this dataset, so these
            # checks are skipped; kept for when expected values get filled in
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # apparently we can't estimate any more
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0 
Ejemplo n.º 5
0
    def test_summary2_unifiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # new with 1000 bins. copy expected from R
        tryList = [
            ('cars.csv', 'c.hex', [
                (None, None,None,None,None,None),
                ('economy (mpg)', None,None,None,None,None),
                ('cylinders', None,None,None,None,None),
            ],
            ),
            ('runifA.csv', 'A.hex', [
                (None,  1.00, 25.00, 50.00, 75.00, 100.0),
                ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
            ],
            ),
            # colname, (min, 25th, 50th, 75th, max)
            ('runif.csv', 'x.hex', [
                (None,  1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
            ],
            ),
            ('runifB.csv', 'B.hex', [
                (None,  1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                ('x', -100.00, -50.1, 0.974, 51.7, 100,00),
            ],
            ),

            ('runifC.csv', 'C.hex', [
                (None,  1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
            ],
            ),
        ]


        timeoutSecs = 15
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        timeoutSecs = 60
        for (csvFilename, hex_key, expectedCols) in tryList:

            csvPathname = csvFilename
            csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
                schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            # okay to vary MAX_QBINS because we adjust the expected accuracy
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0]), colname, expected[0]
                else:
                    # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page
                    scipyCol += 1
                    continue

                quantile = 0.5 if DO_MEDIAN else .999
                # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2
                qresult = q['result']
                qresult_single = q['result_single']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", q['iterations'])
                h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
                print h2o.dump_json(q)

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),

                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):",  h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
                    pctile = stats['pctile']


                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange/(MAX_QBINS-2)
                    maxErr = 0.5 * expectedBin # should we have some fuzz for fp?

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0
                    

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                    # need to ignore the car names
                    if colname!='' and expected[scipyCol]:
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=True,
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else 0.999,
                            # FIX! ignore for now
                            h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                            )

                        if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'):
                            raise Exception("stopping to look")
                                


                scipyCol += 1

            trial += 1
Ejemplo n.º 6
0
    def test_summary2_unifiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # new with 1000 bins. copy expected from R
        tryList = [
            (
                'cars.csv',
                'c.hex',
                [
                    (None, None, None, None, None, None),
                    ('economy (mpg)', None, None, None, None, None),
                    ('cylinders', None, None, None, None, None),
                ],
            ),
            (
                'runifA.csv',
                'A.hex',
                [
                    (None, 1.00, 25.00, 50.00, 75.00, 100.0),
                    ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
                ],
            ),
            # colname, (min, 25th, 50th, 75th, max)
            (
                'runif.csv',
                'x.hex',
                [
                    (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                    ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                    ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                    ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
                ],
            ),
            (
                'runifB.csv',
                'B.hex',
                [
                    (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                    ('x', -100.00, -50.1, 0.974, 51.7, 100, 00),
                ],
            ),
            (
                'runifC.csv',
                'C.hex',
                [
                    (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                    ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
                ],
            ),
        ]

        timeoutSecs = 15
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        timeoutSecs = 60
        for (csvFilename, hex_key, expectedCols) in tryList:

            csvPathname = csvFilename
            csvPathnameFull = h2i.find_folder_and_filename('smalldata',
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            # okay to vary MAX_QBINS because we adjust the expected accuracy
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname,
                                     expected[0]), colname, expected[0]
                else:
                    # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page
                    scipyCol += 1
                    continue

                quantile = 0.5 if DO_MEDIAN else .999
                # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
                q = h2o.nodes[0].quantiles(
                    source_key=hex_key,
                    column=column['colname'],
                    quantile=quantile,
                    max_qbins=MAX_QBINS,
                    multiple_pass=2,
                    interpolation_type=7)  # for comparing to summary2
                qresult = q['result']
                qresult_single = q['result_single']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", q['iterations'])
                h2p.blue_print("h2o quantiles interpolated:",
                               q['interpolated'])
                print h2o.dump_json(q)

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),

                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype = stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype != 'Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                        mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                        sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    expectedPct = [
                        0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9,
                        0.95, 0.99
                    ]
                    pctile = stats['pctile']

                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange / (MAX_QBINS - 2)
                    maxErr = 0.5 * expectedBin  # should we have some fuzz for fp?

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(
                        mins[0],
                        expected[1],
                        tol=maxErr,
                        msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(
                        pctile[3],
                        expected[2],
                        tol=maxErr,
                        msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(
                        pctile[5],
                        expected[3],
                        tol=maxErr,
                        msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(
                        pctile[7],
                        expected[4],
                        tol=maxErr,
                        msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(
                        maxs[0],
                        expected[5],
                        tol=maxErr,
                        msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype != 'Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                    # need to ignore the car names
                    if colname != '' and expected[scipyCol]:
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=True,
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else 0.999,
                            # FIX! ignore for now
                            h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                        )

                        if False and h2o_util.approxEqual(pctile[5],
                                                          0.990238116744,
                                                          tol=0.002,
                                                          msg='stop here'):
                            raise Exception("stopping to look")

                scipyCol += 1

            trial += 1
Ejemplo n.º 7
0
    def test_summary2_exp(self):
        """Run Summary2 on synthetic exponentially-distributed data.

        Generates one-column csv files (via write_syn_dataset with a random
        lambda), parses each, runs Summary2, checks colname/min/percentiles/max
        against the expected tuple (all stats are None here, so effectively
        only the colname check fires), and cross-checks the median (or 0.999
        quantile) against a scipy/sort computation.
        """
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # one lambda shared across all the generated datasets in this run
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (10, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
            (100, 1, 'x.hex', 1, 20000, ('C1', None, None, None, None, None)),
            (1000, 1, 'x.hex', -5000, 0, ('C1', None, None, None, None, None)),
            (10000, 1, 'x.hex', -100000, 100000, ('C1', None, None, None, None,
                                                  None)),
            (100000, 1, 'x.hex', -1, 1, ('C1', None, None, None, None, None)),
            (1000000, 1, 'A.hex', 1, 100, ('C1', None, None, None, None,
                                           None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax,
             expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            # the generator returns the actual min/max it produced
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname,
                                                           rowCount,
                                                           colCount,
                                                           lambd=LAMBD,
                                                           SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            # max error = half the bin size (20 bins), plus 5% fuzz for fp error
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:",
                             h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]
            pctile = stats['pctile']
            # the thresholds h2o used, should match what we expected
            # all expected stats are None in tryList, so these are skipped today;
            # kept so real expected values can be filled in later
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            # histogram fields, printed only
            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "Can't estimate the bin distribution"

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            # safe to clear h2o keys here: the scipy comparison below only
            # reads the local csv file, not any h2o key
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if colname != '' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )
Ejemplo n.º 8
0
    def __init__(self, kmeansResult, parameters, numRows, numCols, labels, noPrint=False, **kwargs):
        """Wrap and validate a KMeans model result from h2o.

        kmeansResult: json response; ['models'][0]['output'] is handed to the base class,
            which presumably exposes withinss/totss/size/centers etc. as attributes.
        parameters: the build parameters dict (may contain 'k', 'max_iterations')
        numRows/numCols: expected row count / column count after ignored-column removal
            (pass None/0 to skip those checks)
        labels: expected column names after ignored-column removal (or None to skip)
        noPrint: passed through to the base class to suppress its printing
        Raises on NaN centers; asserts on size/center/label mismatches.
        """
        super(KMeansObj, self).__init__(kmeansResult['models'][0]['output'], "KMeans", noPrint=noPrint)

        print self.withinss # per cluster
        print self.totss
        print self.tot_withinss
        print self.betweenss

        # should model builder add this to the kmeansResult?
        if 'python_elapsed' in kmeansResult:
            self.python_elapsed = kmeansResult['python_elapsed']

        size = self.size # [78, 5, 41, 76]
        model_category = self.model_category # Clustering
        iterations = self.iterations # 11.0
        domains = self.domains 
        names = self.names 
        categorical_column_count = self.categorical_column_count # 0
        centers_data = self.centers.data # [ 4 lists of centers ]
        # h2o returns it sliced across centers. transpose the list of lists, drop 0 which is the cluster id?
        # gotta turn the strings into numbers
        centersStr = [list(x) for x in zip(*centers_data[1:])]
        centers = [map(float, c) for c in centersStr]

        withinss = self.withinss
        totss = self.totss

        # cluster sizes should account for every row
        if numRows:
            assert numRows==sum(size)

        if 'k' in parameters:
            k = parameters['k']
            assert len(centers) == k
            assert len(size) == k

        if numCols:
            assert len(names) == numCols, \
                "Need to pass correct numCols after ignored columns decrement %s %s %s" % (len(names), numCols, names)
            for c in centers:
                assert len(c) == numCols, "%s %s" % (len(c), numCols)

        # this should be true 
        if labels:
            assert len(labels) == numCols, \
                "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols)
            assert len(labels) == len(names), \
                "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names))
            assert labels == names

        if 'max_iterations' in parameters:
            max_iterations = parameters['max_iterations']
            assert max_iterations >= iterations

        # we could check the centers are within the min/max of each column
        for i,c in enumerate(centers):
            for n in c:
                if math.isnan(float(n)):
                    raise Exception("cluster", i, "has NaN:", n, "center:", c)

        # create a tuple for each cluster result, then sort by rows for easy comparison
        # maybe should sort by centers?
        # put a cluster index in there too, (leftmost) so we don't lose track
        tuples = zip(range(len(centers)), centers, size, withinss)
        # print "tuples:", dump_json(tuples)
        # can we sort on the sum of the centers?
        self.tuplesSorted = sorted(tuples, key=lambda tup: sum(tup[1]))

        print "iterations:", iterations
        # undo for printing what the caller will see
        ids, centers, size, withinss = zip(*self.tuplesSorted)
        for i,c in enumerate(centers):
            print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c)
            # NOTE(review): rows_per_cluster and size print the same value -- confirm intended
            print "rows_per_cluster[%s]: " % i, size[i]
            print "withinss[%s]: " % i, withinss[i]
            print "size[%s]:" % i, size[i]

        print "KMeansObj created for:", "???"# vars(self)

        # shouldn't have any errors
        check_sandbox_for_errors()
    def test_summary2_uniform_int_w_NA(self):
        """Run Summary2 on synthetic uniform integer data (with NAs).

        Generates one-column csv files of uniform ints over various ranges,
        parses each, runs Summary2, checks min/max/percentiles against the
        expected tuple within a bin-size-derived tolerance, checks interior
        histogram bins for roughly uniform counts, then cross-checks the
        median (or 0.999 quantile) against a scipy/sort computation.
        """
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # scale factor for the first (large-range) case
        M = 100
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, "B.hex", 1, 1000 * M, ("C1", 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)),
            (ROWS, 1, "B.hex", 1, 1000, ("C1", 1.0, 250.0, 500.0, 750.0, 1000.0)),
            (ROWS, 1, "x.hex", 1, 20000, ("C1", 1.0, 5000.0, 10000.0, 15000.0, 20000.0)),
            (ROWS, 1, "x.hex", -5000, 0, ("C1", -5000.00, -3750.0, -2500.0, -1250.0, 0)),
            (ROWS, 1, "x.hex", -100000, 100000, ("C1", -100000.0, -50000.0, 0, 50000.0, 100000.0)),
            # (ROWS, 1, 'A.hex', 1, 101,             ('C1',   1.0, 26.00, 51.00, 76.00, 101.0)),
            # (ROWS, 1, 'A.hex', -99, 99,            ('C1',  -99, -49.0, 0, 49.00, 99)),
            (ROWS, 1, "B.hex", 1, 10000, ("C1", 1.0, 2501.0, 5001.0, 7501.0, 10000.0)),
            (ROWS, 1, "B.hex", -100, 100, ("C1", -100.0, -50.0, 0.0, 50.0, 100.0)),
            (ROWS, 1, "C.hex", 1, 100000, ("C1", 1.0, 25001.0, 50001.0, 75001.0, 100000.0)),
            # (ROWS, 1, 'C.hex', -101, 101,          ('C1',  -101, -51, -1, 49.0, 100.0)),
        ]
        if not DO_REAL:
            # only 3 integer values!
            tryList.append((1000000, 1, "x.hex", -1, 1, ("C1", -1.0, -1, 0.000, 1, 1.00)))

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?

            # one qbin's width of tolerance...
            maxDelta = (expectedMax - expectedMin) / (MAX_QBINS + 0.0)
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * 0.01
            maxDelta = maxDelta + maxShift

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            # full path needed later for the scipy comparison
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=60, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult["summaries"][0]
            colname = column["colname"]
            self.assertEqual(colname, expected[0])

            coltype = column["type"]
            nacnt = column["nacnt"]

            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            pct = stats["pct"]
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats["pctile"]
            h2o_util.assertApproxEqual(
                pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
            )
            h2o_util.assertApproxEqual(
                pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
            )
            h2o_util.assertApproxEqual(
                pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
            )

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # (the [1:-1] slice actually skips both edge bins; see comment below)
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                # NOTE(review): msg uses e (numRows-based) but the check uses rowCount -- confirm they match with NAs
                self.assertAlmostEqual(
                    b, rowCount / len(hcnt), delta=0.01 * rowCount, msg="Bins not right. b: %s e: %s" % (b, e)
                )

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0
            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != "":
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )

            h2o.nodes[0].remove_all_keys()
Ejemplo n.º 10
0
    def test_summary2_uniform_int_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        M = 100
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'B.hex', 1, 1000 * M,
             ('C1', 1.0 * M, 250.0 * M, 500.0 * M, 750.0 * M, 1000.0 * M)),
            (ROWS, 1, 'B.hex', 1, 1000, ('C1', 1.0, 250.0, 500.0, 750.0,
                                         1000.0)),
            (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.0, 5000.0, 10000.0, 15000.0,
                                          20000.0)),
            (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5000.00, -3750.0, -2500.0,
                                          -1250.0, 0)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100000.0, -50000.0, 0,
                                                 50000.0, 100000.0)),

            # (ROWS, 1, 'A.hex', 1, 101,             ('C1',   1.0, 26.00, 51.00, 76.00, 101.0)),
            # (ROWS, 1, 'A.hex', -99, 99,            ('C1',  -99, -49.0, 0, 49.00, 99)),
            (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.0, 2501.0, 5001.0, 7501.0,
                                          10000.0)),
            (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.0, -50.0, 0.0, 50.0,
                                           100.0)),
            (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.0, 25001.0, 50001.0,
                                           75001.0, 100000.0)),
            # (ROWS, 1, 'C.hex', -101, 101,          ('C1',  -101, -51, -1, 49.0, 100.0)),
        ]
        if not DO_REAL:
            # only 3 integer values!
            tryList.append(\
                (1000000, 1, 'x.hex', -1, 1,              ('C1',  -1.0, -1, 0.000, 1, 1.00)) \
                )

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / (MAX_QBINS + 0.0))
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * .01
            maxDelta = maxDelta + maxShift

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=60,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(
                    hcnt
                )  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(b,
                                       rowCount / len(hcnt),
                                       delta=.01 * rowCount,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0
            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )

            h2o.nodes[0].remove_all_keys()
Ejemplo n.º 11
0
    def test_summary2_uniform(self):
        """Check h2o Summary2 stats on uniformly-distributed synthetic columns.

        For each tryList case: write a uniform random csv over
        [expectedMin, expectedMax], parse it into h2o, run Summary2 and
        Quantiles, then verify colname, min/max, and the 25/50/75th
        percentiles and per-bin histogram counts against the expected
        uniform distribution. Finally cross-checks the h2o median (or .999
        quantile) against a scipy/sort computation via
        h2o_summ.quantile_comparisons.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # each case: (rowCount, colCount, hex_key, expectedMin, expectedMax,
        #             [colname, min, 25th, 50th, 75th, max])
        # expected[1]/expected[5] get overwritten below with the actual
        # generated min/max, so only the interior percentiles matter here.
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 0.0, 20000.0,
             ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]),
            (ROWS, 1, 'x.hex', -5000.0, 0.0,
             ['C1', -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
            (ROWS, 1, 'x.hex', -100000.0, 100000.0,
             ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
            (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]),
            (ROWS, 1, 'A.hex', 1.0, 100.0,
             ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]),
            (ROWS, 1, 'A.hex', -99.0, 99.0,
             ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]),
            (ROWS, 1, 'B.hex', 1.0, 10000.0,
             ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
            (ROWS, 1, 'B.hex', -100.0, 100.0,
             ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
            (ROWS, 1, 'C.hex', 1.0, 100000.0,
             ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
            (ROWS, 1, 'C.hex', -100.0, 100.0,
             ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
        ]

        # NOTE(review): timeoutSecs is immediately overwritten to 60 below;
        # n and lenNodes are assigned but never read in this method.
        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:

            # fresh seed per generated file so runs differ
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount,
                                                       colCount, expectedMin,
                                                       expectedMax,
                                                       SEEDPERFILE)
            # adjust the min/max depending on what the min/max actually was!
            # the expected 25%/50%/75% will still be off
            expected[1] = actualMin
            expected[5] = actualMax

            # max error = half the bin size?
            # use this for comparing to sklearn/sort
            expectedRange = expectedMax - expectedMin
            # because of floor and ceil effects due we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange / (MAX_QBINS - 2)
            maxDelta = 1 * expectedBin

            # how much error do we get in the random distribution gen? pain. It's a probability issue
            # smaller error likely with larger # of values.
            # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
            # at actual data
            # this is way too coarse. can't get the distribution tight?
            maxDeltaPlusDistVariance = 10 * maxDelta
            # allow some fuzz in the comparison to scipy/sort
            maxDelta = 1.1 * maxDelta

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key,
                                       column=column['colname'],
                                       quantile=quantile,
                                       max_qbins=MAX_QBINS,
                                       multiple_pass=2,
                                       interpolation_type=7)  # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            # these should match exactly except for fp compare error?
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       rel=.00001,
                                       msg='min is not expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       rel=.00001,
                                       msg='max is not expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            # pctile indexes follow expectedPct: [3]=25th, [5]=50th, [7]=75th
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance,
                msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance,
                msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance,
                msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt) > 2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                self.assertAlmostEqual(b,
                                       rowCount / len(hcnt),
                                       delta=.01 * rowCount,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxDelta,
                )

            # clean up cluster keys so trials don't accumulate state
            h2o.nodes[0].remove_all_keys()
Ejemplo n.º 12
0
    def test_mixed_int_enum_many(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
        enumList = ['abc', 'def', 'ghi']
        # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
        intList = [0, 1, 2, '']
        expectedList = [ 'abc', 'def', 'ghi']

        tryList = [
            # not sure about this case
            # some of the cases interpret as ints now (not as enum)
            (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1], intList[0:2], False),
            # colname, (min, COLS5th, 50th, 75th, max)
            (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2], intList[0:1], True),
            # fails this case
            (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1], intList[0:1], True),
            (ROWS, COLS, 'd.hex', enumList[0: ], expectedList[0: ], intList[0:1], True),
            (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2], intList[0:2], True),
            # this case seems to fail
            (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1], intList[0:2], True),
            # this seems wrong also
            (ROWS, COLS, 'g.hex', enumList[0: ], expectedList[0: ], intList[0:2], True),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        class Column(object):
            def __init__(self, column):
                assert isinstance(column, dict)
                for k,v in column.iteritems():
                    setattr(self, k, v) # achieves self.k = v

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, enumChoices, enumExpected, intChoices, resultIsEnum) in tryList:
            # max error = half the bin size?
        
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, enumChoices, intChoices)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', checkHeader=0,
                hex_key=hex_key, timeoutSecs=10, doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            print "numRows:", numRows, "numCols:", numCols
            inspect = h2o_cmd.runInspect(None, hex_key)
            
            print "\nTrial:", trial, csvFilename

            # this summary only does one column?
            # assert colCount == len(columns), "%s %s" % (colCount, len(columns))

            for i in range(colCount):
                summaryResult = h2o_cmd.runSummary(key=hex_key, column="C" + str(i+1))
                h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

                columns = summaryResult['frames'][0]['columns']
                co = Column(columns[0])
                # how are enums binned. Stride of 1? (what about domain values)
                coList = [
                    co.base,
                    len(co.bins),
                    len(co.data),
                    co.domain,
                    co.label,
                    co.maxs,
                    co.mean,
                    co.mins,
                    co.missing,
                    co.ninfs,
                    co.pctiles,
                    co.pinfs,
                    co.precision,
                    co.sigma,
                    co.str_data,
                    co.stride,
                    co.type,
                    co.zeros,
                    ]

                coNameList = [
                    'co.base',
                    'len(co.bins)',
                    'len(co.data)',
                    'co.domain',
                    'co.label',
                    'co.maxs',
                    'co.mean',
                    'co.mins',
                    'co.missing',
                    'co.ninfs',
                    'co.pctiles',
                    'co.pinfs',
                    'co.precision',
                    'co.sigma',
                    'co.str_data',
                    'co.stride',
                    'co.type',
                    'co.zeros',
                    ]

                

                for c,n in zip(coList, coNameList):
                    print n+":", c

                print "len(co.bins):", len(co.bins)

                print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
                # what is precision. -1?
                # This can go to NaN (string) with big numbers
                # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

                # can be None if col is all NA
                # print "FIX! hacking the co.pctiles because it's short by two"
                # pctiles = [0] + co.pctiles + [0]

                assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (co.zeros, numRows)

                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(co.type, 'Enum', "trial %s: Expecting type to be Enum for %s col colname %s" % (trial, i, colname))

                if ENABLE_ASSERTS and resultIsEnum:
                    # not always there
                    cardinality = len(co.domain)
                    self.assertEqual(cardinality, len(enumChoices),
                        msg="trial %s: cardinality %s should be %s" % (trial, cardinality, len(enumChoices))) 

                # assume I create the list above in the same order that h2o will show the order. sorted?
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(co.bins, enumChoices) 

                hcntTotal = sum(co.bins)
                numRowsCreated = rowCount + len(intChoices)
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(hcntTotal, numRowsCreated - expectedNaCnt[i])

                self.assertEqual(numRows, numRowsCreated,
                    msg="trial %s: numRows %s should be %s" % (trial, numRows, numRowsCreated))

                nacnt = co.missing
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(nacnt, expectedNaCnt[i], 
                        "trial %s: Column %s Expected %s. nacnt %s incorrect" % (trial, i, expectedNaCnt[i], nacnt))

                # FIX! no checks for the case where it got parsed as int column!
            trial += 1
Ejemplo n.º 13
0
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 0.0, 20000.0,        ['C1',  0, 5000.0, 10000.0, 15000.0, 20000.0]),
            (ROWS, 1, 'x.hex', -5000.0, 0.0,        ['C1',  -5000.0, -3750.0, -2550.0, -1250.0, 0.0]),
            (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1',  -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
            (ROWS, 1, 'x.hex', -1.0, 1.0,           ['C1',  -1.0, -0.50, 0.0, 0.50, 1.0]),

            (ROWS, 1, 'A.hex', 1.0, 100.0,          ['C1',   1.0, 26.0, 51.0, 76.0, 100.0]),
            (ROWS, 1, 'A.hex', -99.0, 99.0,         ['C1',  -99.0, -50.0, 0.0, 50.0, 99.0]),

            (ROWS, 1, 'B.hex', 1.0, 10000.0,        ['C1',   1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
            (ROWS, 1, 'B.hex', -100.0, 100.0,       ['C1',  -100.0, -50.0, 0.0, 50.0, 100.0]),

            (ROWS, 1, 'C.hex', 1.0, 100000.0,       ['C1',   1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
            (ROWS, 1, 'C.hex', -100.0, 100.0,       ['C1',  -100.0, -50.0, 0.0, 50.0, 100.0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount, 
                expectedMin, expectedMax, SEEDPERFILE)
            # adjust the min/max depending on what the min/max actually was!
            # the expected 25%/50%/75% will still be off
            expected[1] = actualMin
            expected[5] = actualMax

            # max error = half the bin size?
            # use this for comparing to sklearn/sort
            expectedRange = expectedMax - expectedMin
            # because of floor and ceil effects due we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange/(MAX_QBINS-2)
            maxDelta = 0.5 * expectedBin

            # how much error do we get in the random distribution gen? pain. It's a probability issue
            # smaller error likely with larger # of values.
            # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
            # at actual data
            # this is way too coarse. can't get the distribution tight? 
            maxDeltaPlusDistVariance = 10 * maxDelta
            # allow some fuzz in the comparison to scipy/sort
            maxDelta = 1.1 * maxDelta 

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            # these should match exactly except for fp compare error?
            h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance, 
                msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance, 
                msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance, 
                msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt)
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1


            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull, 
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10], 
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxDelta,
                    )

            h2o.nodes[0].remove_all_keys()
Ejemplo n.º 14
0
    def __init__(self, kmeansResult, parameters, numRows, numCols, labels, noPrint=False, **kwargs):
        """Wrap a KMeans build result and cross-check it against the request.

        The base class copies kmeansResult['models'][0]['output'] onto self as
        attributes (rows, clusters, iters, names, ...), which this method then
        validates against the build parameters and the known dataset shape.

        kmeansResult: json response from the KMeans model build
        parameters: dict of build parameters; 'k' and 'max_iters' are checked if present
        numRows: expected training-frame row count, or falsy to skip the check
        numCols: expected column count after ignored-column removal, or falsy to skip
        labels: expected column names after ignored-column removal, or falsy to skip
        noPrint: passed through to the base class to suppress its printing

        Raises Exception if any cluster center contains a NaN coordinate.
        """
        super(KMeansObj, self).__init__(kmeansResult['models'][0]['output'], "KMeans", noPrint=noPrint)

        print self.withinmse # per cluster
        print self.avgss
        print self.avgwithinss
        print self.avgbetweenss

        # should model builder add this to the kmeansResult?
        if 'python_elapsed' in kmeansResult:
            self.python_elapsed = kmeansResult['python_elapsed']

        # local aliases for the attributes set by the base class
        rows = self.rows # [78, 5, 41, 76]
        model_category = self.model_category # Clustering
        iters = self.iters # 11.0
        domains = self.domains # [None, None, None, None, None, None, None, None, None, None, None, None, None, None]
        names = self.names 
        # [u'STR', u'OBS', u'AGMT', u'FNDX', u'HIGD', u'DEG', u'CHK', u'AGP1', u'AGMN', u'NLV', u'LIV', u'WT', u'AGLP', u'MST']
        ncats = self.ncats # 0
        clusters = self.clusters # [ 4 lists of centers ]
        withinmse = self.withinmse
        avgss = self.avgss

        # cluster sizes must account for every training row
        if numRows:
            assert numRows==sum(rows)

        # k clusters requested -> k centers and k size entries
        if 'k' in parameters:
            k = parameters['k']
            assert len(clusters) == k
            assert len(rows) == k

        if numCols:
            assert len(names) == numCols, \
                "Need to pass correct numCols after ignored columns decrement %s %s" % (len(names), numCols)
            for c in clusters:
                assert len(c) == numCols, "%s %s" % (len(c), numCols)

        # this should be true 
        if labels:
            assert len(labels) == numCols, \
                "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols)
            assert len(labels) == len(names), \
                "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names))
            assert labels == names

        if 'max_iters' in parameters:
            max_iters = parameters['max_iters']
            assert max_iters >= iters

        # we could check the centers are within the min/max of each column
        for i,c in enumerate(clusters):
            for n in c:
                if math.isnan(float(n)):
                    raise Exception("cluster", i, "has NaN:", n, "center:", c)

        # shouldn't have any errors
        check_sandbox_for_errors()

        # create a tuple for each cluster result, then sort by rows for easy comparison
        # maybe should sort by centers?
        # put a cluster index in there too, (leftmost) so we don't lose track
        # NOTE(review): itemgetter(3) keys on the clusters entry, not rows
        # (index 2) as the comment above says — confirm which was intended.
        tuples = zip(range(len(clusters)), withinmse, rows, clusters)
        self.tuplesSorted = sorted(tuples, key=itemgetter(3))

        # undo for printing what the caller will see
        ids, withinmse, rows, clusters = zip(*self.tuplesSorted)
        print "iters:", iters
        print "ids:", ids
        print "withinmse:", withinmse
        print "rows:", rows
        for i,c in enumerate(clusters):
            print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c)
        
        print "KMeansObj created for:", "???"# vars(self)
Ejemplo n.º 15
0
    def test_mixed_int_enum_many(self):
        """Summary sanity test on synthetic columns that mix enum strings with ints.

        Generates CSVs whose columns contain a chosen subset of enum strings
        plus a few int (or blank) rows, parses them into h2o, runs Summary on
        each column, and checks type/cardinality/bin-count/NA-count against
        what write_syn_dataset reported.  Many assertions are gated on the
        module-level ENABLE_ASSERTS flag because several cases are known to
        fail (see the per-case comments in tryList).
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
        enumList = ['abc', 'def', 'ghi']
        # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
        intList = [0, 1, 2, '']
        expectedList = ['abc', 'def', 'ghi']

        # each case: (rows, cols, key, enum choices, expected enums,
        #             int choices, whether the column should parse as enum)
        tryList = [
            # not sure about this case
            # some of the cases interpret as ints now (not as enum)
            (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1],
             intList[0:2], False),
            # colname, (min, COLS5th, 50th, 75th, max)
            (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2],
             intList[0:1], True),
            # fails this case
            (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1],
             intList[0:1], True),
            (ROWS, COLS, 'd.hex', enumList[0:], expectedList[0:], intList[0:1],
             True),
            (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2],
             intList[0:2], True),
            # this case seems to fail
            (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1],
             intList[0:2], True),
            # this seems wrong also
            (ROWS, COLS, 'g.hex', enumList[0:], expectedList[0:], intList[0:2],
             True),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        # thin wrapper: exposes the summary-result dict's keys as attributes
        class Column(object):
            def __init__(self, column):
                assert isinstance(column, dict)
                for k, v in column.iteritems():
                    setattr(self, k, v)  # achieves self.k = v

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, enumChoices, enumExpected,
             intChoices, resultIsEnum) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            # expectedNaCnt: per-column NA counts the generator says it wrote
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, enumChoices,
                                              intChoices)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           check_header=0,
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            print "numRows:", numRows, "numCols:", numCols
            inspect = h2o_cmd.runInspect(None, hex_key)

            print "\nTrial:", trial, csvFilename

            # this summary only does one column?
            # assert colCount == len(columns), "%s %s" % (colCount, len(columns))

            # run Summary per column (h2o column labels are C1..Cn)
            for i in range(colCount):
                summaryResult = h2o_cmd.runSummary(key=hex_key,
                                                   column="C" + str(i + 1))
                h2o.verboseprint("summaryResult:",
                                 h2o.dump_json(summaryResult))

                # columns = summaryResult['frames'][0]['columns']
                co = Column(summaryResult)
                # how are enums binned. Stride of 1? (what about domain values)
                coList = [
                    co.base,
                    len(co.bins),
                    len(co.data),
                    co.domain,
                    co.label,
                    co.maxs,
                    co.mean,
                    co.mins,
                    co.missing,
                    co.ninfs,
                    co.pctiles,
                    co.pinfs,
                    co.precision,
                    co.sigma,
                    co.str_data,
                    co.stride,
                    co.type,
                    co.zeros,
                ]

                coNameList = [
                    'co.base',
                    'len(co.bins)',
                    'len(co.data)',
                    'co.domain',
                    'co.label',
                    'co.maxs',
                    'co.mean',
                    'co.mins',
                    'co.missing',
                    'co.ninfs',
                    'co.pctiles',
                    'co.pinfs',
                    'co.precision',
                    'co.sigma',
                    'co.str_data',
                    'co.stride',
                    'co.type',
                    'co.zeros',
                ]

                for c, n in zip(coList, coNameList):
                    print n + ":", c

                print "len(co.bins):", len(co.bins)

                print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                    co.mean)
                # what is precision. -1?
                # This can go to NaN (string) with big numbers
                # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

                # can be None if col is all NA
                # print "FIX! hacking the co.pctiles because it's short by two"
                # pctiles = [0] + co.pctiles + [0]

                assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (
                    co.zeros, numRows)

                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(
                        co.type, 'enum',
                        "Expecting co.type %s to be 'enum' for %s co label  %s"
                        % (co.type, i, co.label))

                if ENABLE_ASSERTS and resultIsEnum:
                    # not always there
                    cardinality = len(co.domain)
                    self.assertEqual(
                        cardinality,
                        len(enumChoices),
                        msg="trial %s: cardinality %s should be %s" %
                        (trial, cardinality, len(enumChoices)))

                # assume I create the list above in the same order that h2o will show the order. sorted?
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(co.bins, enumChoices)

                # histogram bin counts should cover every non-NA row
                hcntTotal = sum(co.bins)
                numRowsCreated = rowCount + len(intChoices)
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(hcntTotal,
                                     numRowsCreated - expectedNaCnt[i])

                self.assertEqual(numRows,
                                 numRowsCreated,
                                 msg="trial %s: numRows %s should be %s" %
                                 (trial, numRows, numRowsCreated))

                nacnt = co.missing
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(
                        nacnt, expectedNaCnt[i],
                        "trial %s: Column %s Expected %s. nacnt %s incorrect" %
                        (trial, i, expectedNaCnt[i], nacnt))

                # FIX! no checks for the case where it got parsed as int column!
            trial += 1
Ejemplo n.º 16
0
    def test_summary2_exp(self):
        """Summary test on a synthetic exponentially-distributed column.

        Writes an exponential(lambd) dataset, parses it, runs Summary on C1,
        then checks min/25/50/75/max percentiles against the generator's
        reported range (within maxDelta), and finally compares the h2o median
        (or 0.999 quantile, per DO_MEDIAN) against a scipy/sort-based value
        via h2o_summ.quantile_comparisons.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # co.label, (min, 25th, 50th, 75th, max)
# parse setup error
#            (1,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (5,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (10,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (100,    1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
#            (1000,   1, 'x.hex', -5000, 0,        ['C1', None, None, None, None, None]),
#            (10000,  1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
#            (100000, 1, 'x.hex', -1, 1,           ['C1', None, None, None, None, None]),
#            (1000000, 1, 'A.hex', 1, 100,          ['C1', None, None, None, None, None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60

        # thin wrapper: exposes the summary-result dict's keys as attributes
        class Column(object):
            def __init__(self, column):
                assert isinstance(column, dict)
                for k,v in column.iteritems():
                    setattr(self, k, v) # achieves self.k = v

        # rangeMin/rangeMax are not used; the generator returns the real range
        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname, 
                rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            # max error = half the bin size (range / 20 bins), plus 5% fp slop
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # fill in the actual generated min/max for the checks below
            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', 
                hex_key=hex_key, timeoutSecs=30, doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

            inspect = h2o_cmd.runInspect(key=parse_key)
            missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
            print "\n" + csvFilename
            # column 0?
            summaryResult = h2o_cmd.runSummary(key=hex_key, column='C1')
            h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

            # default_pctiles
            # isText
            # rows
            # off
            # key
            # checksum

            # only one column
            columns = summaryResult['frames'][0]['columns']
            default_pctiles = summaryResult['frames'][0]['default_pctiles']
            co = Column(columns[0])
            # how are enums binned. Stride of 1? (what about domain values)
            coList = [
                co.base,
                len(co.bins),
                len(co.data),
                co.domain,
                co.label,
                co.maxs,
                co.mean,
                co.mins,
                co.missing,
                co.ninfs,
                co.pctiles,
                co.pinfs,
                co.precision,
                co.sigma,
                co.str_data,
                co.stride,
                co.type,
                co.zeros,
                ]

            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)

            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            # what is precision. -1?
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            # pad both ends so the index positions below line up with
            # the 25/50/75 slots — presumably matches default_pctiles; verify
            pctiles = [0] + co.pctiles + [0]
            
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(co.label, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(co.mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctiles[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctiles[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctiles[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(co.maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            if expected[1] and expected[5]:
                expectedRange = expected[5] - expected[1]
                # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                # the extra bin for the max value, is an extra bin..ignore
                expectedBin = expectedRange/(MAX_QBINS-2)
                maxErr = expectedBin # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0

            pt = h2o_util.twoDecimals(pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):", compareActual)
            print "co.label:", co.label, "co.maxs (2 places):", mx
            print "co.label:", co.label, "co.mins (2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            print "h2oSummary2MaxErr", maxErr
            # skip enums (empty label); cross-check the h2o quantile against scipy/sort
            if co.label!='' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctiles[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                    )
Ejemplo n.º 17
0
    def __init__(self,
                 kmeansResult,
                 parameters,
                 numRows,
                 numCols,
                 labels,
                 noPrint=False,
                 **kwargs):
        """Wrap a KMeans build result and cross-check it against the request.

        The base class copies kmeansResult['models'][0]['output'] onto self as
        attributes (size, centers, iterations, names, ...), which this method
        then validates against the build parameters and the known dataset
        shape. Clusters are finally sorted by the sum of their center
        coordinates so callers can compare runs deterministically.

        kmeansResult: json response from the KMeans model build
        parameters: dict of build parameters; 'k' and 'max_iterations' are
            checked if present
        numRows: expected training-frame row count, or falsy to skip the check
        numCols: expected column count after ignored-column removal, or falsy
        labels: expected column names after ignored-column removal, or falsy
        noPrint: passed through to the base class to suppress its printing

        Raises Exception if any cluster center contains a NaN coordinate.
        """
        super(KMeansObj, self).__init__(kmeansResult['models'][0]['output'],
                                        "KMeans",
                                        noPrint=noPrint)

        print self.withinss  # per cluster
        print self.totss
        print self.tot_withinss
        print self.betweenss

        # should model builder add this to the kmeansResult?
        if 'python_elapsed' in kmeansResult:
            self.python_elapsed = kmeansResult['python_elapsed']

        # local aliases for the attributes set by the base class
        size = self.size  # [78, 5, 41, 76]
        model_category = self.model_category  # Clustering
        iterations = self.iterations  # 11.0
        domains = self.domains
        names = self.names
        categorical_column_count = self.categorical_column_count  # 0
        centers_data = self.centers.data  # [ 4 lists of centers ]
        # h2o returns it sliced across centers. transpose the list of lists, drop 0 which is the cluster id?
        # gotta turn the strings into numbers
        centersStr = [list(x) for x in zip(*centers_data[1:])]
        centers = [map(float, c) for c in centersStr]

        withinss = self.withinss
        totss = self.totss

        # cluster sizes must account for every training row
        if numRows:
            assert numRows == sum(size)

        # k clusters requested -> k centers and k size entries
        if 'k' in parameters:
            k = parameters['k']
            assert len(centers) == k
            assert len(size) == k

        if numCols:
            assert len(names) == numCols, \
                "Need to pass correct numCols after ignored columns decrement %s %s %s" % (len(names), numCols, names)
            for c in centers:
                assert len(c) == numCols, "%s %s" % (len(c), numCols)

        # this should be true
        if labels:
            assert len(labels) == numCols, \
                "Need to pass correct labels and numCols after ignored columns removal %s %s" % (len(labels), numCols)
            assert len(labels) == len(names), \
                "Need to pass correct labels after ignored columns removal %s %s" % (len(labels), len(names))
            assert labels == names

        if 'max_iterations' in parameters:
            max_iterations = parameters['max_iterations']
            assert max_iterations >= iterations

        # we could check the centers are within the min/max of each column
        for i, c in enumerate(centers):
            for n in c:
                if math.isnan(float(n)):
                    raise Exception("cluster", i, "has NaN:", n, "center:", c)

        # create a tuple for each cluster result, then sort by rows for easy comparison
        # maybe should sort by centers?
        # put a cluster index in there too, (leftmost) so we don't lose track
        tuples = zip(range(len(centers)), centers, size, withinss)
        # print "tuples:", dump_json(tuples)
        # can we sort on the sum of the centers?
        self.tuplesSorted = sorted(tuples, key=lambda tup: sum(tup[1]))

        print "iterations:", iterations
        # undo for printing what the caller will see
        ids, centers, size, withinss = zip(*self.tuplesSorted)
        for i, c in enumerate(centers):
            print "cluster id %s (2 places):" % ids[i], h2o_util.twoDecimals(c)
            print "rows_per_cluster[%s]: " % i, size[i]
            print "withinss[%s]: " % i, withinss[i]
            print "size[%s]:" % i, size[i]

        print "KMeansObj created for:", "???"  # vars(self)

        # shouldn't have any errors
        check_sandbox_for_errors()
Ejemplo n.º 18
0
def runSummary(node=None,
               key=None,
               column=None,
               expected=None,
               maxDelta=None,
               noPrint=False,
               **kwargs):
    if not key: raise Exception('No key for Summary')
    if not node: node = h2o_nodes.nodes[0]
    # return node.summary(key, **kwargs)

    i = InspectObj(key=key)
    # just so I don't have to change names below
    missingList = i.missingList
    labelList = i.labelList
    numRows = i.numRows
    numCols = i.numCols
    print "labelList:", labelList
    assert labelList is not None

    # doesn't take indices? only column labels?
    # return first column, unless specified

    if not (column is None or isinstance(column, (basestring, int))):
        raise Exception(
            "column param should be string or integer index or None %s %s" %
            (type(column), column))

    # either return the first col, or the col indentified by label. the column identifed could be string or index?
    if column is None:  # means the summary json when we ask for col 0, will be what we return (do all though)
        colNameToDo = labelList
        colIndexToDo = range(len(labelList))
    elif isinstance(column, int):
        colNameToDo = [labelList[column]]
        colIndexToDo = [column]
    elif isinstance(column, basestring):
        colNameToDo = [column]
        if column not in labelList:
            raise Exception("% not in labellist: %s" % (column, labellist))
        colIndexToDo = [labelList.index(column)]
    else:
        raise Exception("wrong type %s for column %s" % (type(column), column))

    # we get the first column as result after walking across all, if no column parameter
    desiredResult = None
    for (colIndex, colName) in zip(colIndexToDo, colNameToDo):
        print "doing summary on %s %s" % (colIndex, colName)
        # ugly looking up the colIndex
        co = SummaryObj(key=key, colIndex=colIndex, colName=colName)
        if not desiredResult:
            desiredResult = co

        if not noPrint:
            for k, v in co:
                # only print [0] of mins and maxs because of the e308 values when they don't have dataset values
                if k == 'mins' or k == 'maxs':
                    print "%s[0]" % k, v[0]
                else:
                    print k, v

        if expected is not None:
            print "len(co.histogram_bins):", len(co.histogram_bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                co.mean)
            # what is precision. -1?
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(
                co.sigma)

            # print "FIX! hacking the co.percentiles because it's short by two"
            # if co.percentiles:
            #     percentiles = [0] + co.percentiles + [0]
            # else:
            #     percentiles = None
            percentiles = co.percentiles
            assert len(co.percentiles) == len(co.default_percentiles)

            # the thresholds h2o used, should match what we expected
            # expected = [0] * 5
            # Fix. doesn't check for expected = 0?

            # max of one bin
            if maxDelta is None:
                maxDelta = (co.maxs[0] - co.mins[0]) / 1000

            if expected[0]:
                h2o_util.assertApproxEqual(co.mins[0],
                                           expected[0],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[1]:
                h2o_util.assertApproxEqual(
                    percentiles[2],
                    expected[1],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    percentiles[4],
                    expected[2],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    percentiles[6],
                    expected[3],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(co.maxs[0],
                                           expected[4],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            MAX_QBINS = 1000
            if expected[0] and expected[4]:
                expectedRange = expected[4] - expected[0]
                # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                # the extra bin for the max value, is an extra bin..ignore
                expectedBin = expectedRange / (MAX_QBINS - 2)
                maxErr = expectedBin  # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0

            pt = h2o_util.twoDecimals(percentiles)

            # only look at [0] for now...bit e308 numbers if unpopulated due to not enough unique values in dataset column
            mx = h2o_util.twoDecimals(co.maxs[0])
            mn = h2o_util.twoDecimals(co.mins[0])

            print "co.label:", co.label, "co.percentiles (2 places):", pt
            print "co.default_percentiles:", co.default_percentiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! why would percentiles be None? enums?
            if pt is None:
                compareActual = mn, [None] * 3, mx
            else:
                compareActual = mn, pt[2], pt[4], pt[6], mx

            h2p.green_print("actual min/25/50/75/max co.label:", co.label,
                            "(2 places):", compareActual)
            h2p.green_print("expected min/25/50/75/max co.label:", co.label,
                            "(2 places):", expected)

    return desiredResult
Ejemplo n.º 19
0
    def test_summary2_exp(self):
        """Summary2 test on synthetic exponentially-distributed columns.

        Writes exponential(lambd) datasets of increasing row counts, parses
        each, runs Summary2 (old 'summaries' response shape), checks the
        min/25/50/75/max percentiles against the generator's reported range
        (within maxDelta), and cross-checks the h2o median (or 0.999
        quantile, per DO_MEDIAN) against a scipy/sort-based value via
        h2o_summ.quantile_comparisons.
        """
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (5,     1, 'x.hex', 1, 20000,         ['C1', None, None, None, None, None]),
            (10,     1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (100,    1, 'x.hex', 1, 20000,        ['C1', None, None, None, None, None]),
            (1000,   1, 'x.hex', -5000, 0,        ['C1', None, None, None, None, None]),
            (10000,  1, 'x.hex', -100000, 100000, ['C1', None, None, None, None, None]),
            (100000, 1, 'x.hex', -1, 1,           ['C1', None, None, None, None, None]),
            (1000000, 1, 'A.hex', 1, 100,         ['C1', None, None, None, None, None]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(csvPathname, 
                rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE)
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            # max error = half the bin size (range / 20 bins), plus 5% fp slop
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # fill in the actual generated min/max for the checks below
            expected[1] = expectedMin
            expected[5] = expectedMax

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0, hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            # thresholds h2o computes percentiles at; pctile is index-aligned with this
            expectedPct= [0.001, 0.001, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            # the thresholds h2o used, should match what we expected
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            # histogram pieces (printed only; distribution isn't predictable here)
            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "Can't estimate the bin distribution"

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            if expected[1] and expected[5]:
                expectedRange = expected[5] - expected[1]
                # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                # the extra bin for the max value, is an extra bin..ignore
                expectedBin = expectedRange/(MAX_QBINS-2)
                maxErr = expectedBin # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0


            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            # skip enums (empty colname); cross-check h2o quantile against scipy/sort
            if colname!='' and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=False,
                    col=scipyCol,
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxErr,
                    )
Ejemplo n.º 20
0
    def test_quant_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5 * ROWS, 1, 'x.hex', 1, 20000,
             ['C1', 1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
            (5 * ROWS, 1, 'x.hex', -5000, 0,
             ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
            (1 * ROWS, 1, 'x.hex', -100000, 100000,
             ['C1', -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
            (1 * ROWS, 1, 'x.hex', -1, 1,
             ['C1', -1.05, -0.48, 0.0087, 0.50, 1.00]),
            (1 * ROWS, 1, 'A.hex', 1, 100,
             ['C1', 1.05, 26.00, 51.00, 76.00, 100.0]),
            (1 * ROWS, 1, 'A.hex', -99, 99, ['C1', -99, -50.0, 0, 50.00, 99]),
            (1 * ROWS, 1, 'B.hex', 1, 10000,
             ['C1', 1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
            (1 * ROWS, 1, 'B.hex', -100, 100,
             ['C1', -100.10, -50.0, 0.85, 51.7, 100, 00]),
            (1 * ROWS, 1, 'C.hex', 1, 100000,
             ['C1', 1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
            (1 * ROWS, 1, 'C.hex', -101, 101,
             ['C1', -100.10, -50.45, -1.18, 49.28, 100.00]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?
            colname = expected[0]
            maxDelta = ((expectedMax - expectedMin) / 1000.0) / 2.0

            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            # need the full pathname when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            #***************************
            # Parse
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            numRows = pA.numRows
            numCols = pA.numCols
            parse_key = pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])

            #***************************
            # Summary
            co = h2o_cmd.runSummary(key=parse_key)
            default_pctiles = co.default_pctiles

            coList = [
                co.base,
                len(co.bins),
                len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins,
                co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision,
                co.sigma, co.str_data, co.stride, co.type, co.zeros
            ]
            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                co.mean)
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(
                co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            summ_pctiles = [0] + co.pctiles + [0]

            pt = h2o_util.twoDecimals(summ_pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)
            exp = h2o_util.twoDecimals(expected[1:])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                mn[0], pt[3], pt[5], pt[7], mx[0])
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                exp[0], exp[1], exp[2], exp[3], exp[4])

            #***************************
            # Quantile
            # the thresholds h2o used, should match what we expected

            # using + here seems to result in an odd tuple..doesn't look right to h2o param
            # so went with this. Could add '[' and ']' to the list first, before the join.
            probsStr = "[%s]" % ",".join(map(str, probsList))
            parameters = {
                'model_id': "a.hex",
                'training_frame': parse_key,
                'validation_frame': parse_key,
                'ignored_columns': None,
                'probs': probsStr,
            }

            model_key = 'qhex'
            bmResult = h2o.n0.build_model(algo='quantile',
                                          model_id=model_key,
                                          training_frame=parse_key,
                                          parameters=parameters,
                                          timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            msec = bm.jobs[0]['msec']
            print "bm msec", msec

            # quantile result is just a job result to a key
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0], 'model')

            print "model.output:", model.output
            print "model.output:['quantiles']", model.output['quantiles']
            print "model.output:['iterations']", model.output['iterations']
            print "model.output:['names']", model.output['names']
            quantiles = model.output['quantiles'][
                0]  # why is this a double array
            iterations = model.output['iterations']
            assert iterations == 11, iterations
            print "quantiles: ", quantiles
            print "iterations: ", iterations

            # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # cmm = OutputObj(cmmResult, 'cmm')

            # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # mm = OutputObj(mmResult, 'mm')

            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
            h2o_cmd.runStoreView()

            trial += 1
            # compare the last threshold
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=CHECK_PCTILE,
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
                )
            h2o.nodes[0].remove_all_keys()
Ejemplo n.º 21
0
    def test_summary2_unifiles2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # new with 1000 bins. copy expected from R
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
            # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
            ('wonkysummary.csv', 'b.hex', True, [ ('X1', None, None, None, None, None)], 'smalldata', None),
            ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),

        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList:
            h2o.beta_features = False

            if pathPrefix:
                csvPathname = pathPrefix + "/" + csvFilename
            else:
                csvPathname = csvFilename

            csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
            if skipHeader:
                header = 1
            else:
                header = 0
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, 
                schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            # okay to get more cols than we want
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0])

                quantile = 0.5 if DO_MEDIAN else OTHER_Q
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol,
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
                qresult = q['result']
                qresult_single = q['result_single']
                qresult_iterations = q['iterations']
                qresult_interpolated = q['interpolated']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
                h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
                print h2o.dump_json(q)

                self.assertLess(qresult_iterations, 16, 
                    msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    pctile = stats['pctile']

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    ## ignore for blank colnames, issues with quoted numbers
                    # covtype is too big to do in scipy
                    if colname!='' and expected[scipyCol] and csvFilename!= 'covtype.data':
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=skipHeader, # important!!
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else OTHER_Q,
                            h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            )


                scipyCol += 1

            trial += 1
Ejemplo n.º 22
0
    def test_summary2_small(self):
        """Summary2 + quantiles on tiny synthetic datasets of literal values.

        rowCount None means the csv is just the values list itself; a None in
        an expected tuple means "skip that comparison". maxDelta is forced to
        0, i.e. these small cases must match exactly.
        """
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 2, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 10, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 100, 'x.hex', [-1, 0,
                                  1], ('C1', None, None, 0, None, None)),
            (None, 1000, 'x.hex', [-1, 0,
                                   1], ('C1', None, None, 0, None, None)),
            # (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, 0, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?

            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # hmm...say we should be 100% accurate for these tests?
            # NOTE: this deliberately overrides the computed tolerance above
            maxDelta = 0

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            # with no rowCount, the file has exactly one row per literal value
            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values,
                              SEEDPERFILE)

            # full pathname needed when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS,
                                               timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # independent quantiles request on column 0 for cross-checking
            quantile = 0.5 if DO_MEDIAN else .999
            q = h2o.nodes[0].quantiles(source_key=hex_key,
                                       column=0,
                                       interpolation_type=7,
                                       quantile=quantile,
                                       max_qbins=MAX_QBINS,
                                       multiple_pass=2)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(
                qresult_iterations,
                16,
                msg=
                "h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?"
            )

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            # NOTE(review): expectedPct is never compared against pct below — verify intent
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            print "pctile:", pctile
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # (slice also drops the first bin: edge bins absorb boundary effects)
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(
                    hcnt
                )  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(b,
                                       numRows / len(hcnt),
                                       delta=1 + .01 * numRows,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=scipyCol,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                )
Ejemplo n.º 23
0
    def test_summary2_int2B(self):
        """Summary2 on a uniform integer column with values beyond 2^31.

        Exercises quantile/summary behavior when values don't fit in a 32-bit
        int. All expected values are None, so only colname is asserted.
        """
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (100000, 1, 'B.hex', 2533255332, 2633256000,   ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            # tolerance: one bin width, plus fp fuzz, plus 1% of the mean
            # to allow for random-distribution variance
            maxDelta = ((expectedMax - expectedMin)/(MAX_QBINS + 0.0)) 
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
            # also need to add some variance due to random distribution?
            # maybe a percentage of the mean
            distMean = (expectedMax - expectedMin) / 2
            maxShift = distMean * .01
            maxDelta = maxDelta + maxShift

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            # full pathname needed when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=60, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            if expected[0]:
                self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            # NOTE(review): expectedPct is never compared against pct — verify intent
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # apparently we can't estimate any more
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            scipyCol = 0 
Ejemplo n.º 24
0
def runSummary(node=None, key=None, expected=None, column=None, **kwargs):
    if not key: raise Exception('No key for Summary')
    if not node: node = h2o_nodes.nodes[0]
    # return node.summary(key, **kwargs)

    class Column(object):
        def __init__(self, column):
            assert isinstance(column, dict)
            for k,v in column.iteritems():
                setattr(self, k, v) # achieves self.k = v

        def __iter__(self):
            for attr, value in self.__dict__.iteritems():
                yield attr, value

    inspect = runInspect(key=key)
    # change missingList definition: None if all empty, otherwise align to cols. 0 if 0?
    missingList, labelList, numRows, numCols = infoFromInspect(inspect)

    # doesn't take indices? only column labels?
    lastChecksum = None
    # return first column, unless specified
    desiredResult = None
    for label in labelList:
        print "doing summary on %s" % label
        summaryResult = node.summary(key=key, column=label)
        if not desiredResult or (column and column==label):
            desiredResult = summaryResult
        
        verboseprint("column", column, "summaryResult:", dump_json(summaryResult))

        # this should be the same for all the cols? Or does the checksum change?
        frame = summaryResult['frames'][0]
        default_pctiles = frame['default_pctiles']
        checksum = frame['checksum']
        rows = frame['rows']
        columns = frame['columns']

        # assert len(columns) == numCols
        assert rows == numRows
        assert checksum !=0 and checksum is not None
        assert rows!=0 and rows is not None
        assert not frame['isText']
        # FIX! why is frame['key'] = None here?
        # assert frame['key'] == key, "%s %s" % (frame['key'], key)

        # it changes?
        # assert not lastChecksum or lastChecksum == checksum

        lastChecksum = checksum

        # only one column
        co = Column(columns[0])
        # how are enums binned. Stride of 1? (what about domain values)
        coList = [co.base, len(co.bins), len(co.data),
            co.domain, co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles,
            co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros]

        # for c in coList:
        #    print c
        for k,v in co:
            print k, v

        print "len(co.bins):", len(co.bins)
        print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
        # what is precision. -1?
        print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

        print "FIX! hacking the co.pctiles because it's short by two"
        
        if co.pctiles:
            pctiles = [0] + co.pctiles + [0]
        else:
            pctiles = None

        # the thresholds h2o used, should match what we expected
        if expected ==None:
            expected = [0] * 5
        # Fix. doesn't check for expected = 0?
        if expected[0]: h2o_util.assertApproxEqual(co.mins[0], expected[0], tol=maxDelta, 
            msg='min is not approx. expected')
        if expected[1]: h2o_util.assertApproxEqual(pctiles[3], expected[1], tol=maxDelta, 
            msg='25th percentile is not approx. expected')
        if expected[2]: h2o_util.assertApproxEqual(pctiles[5], expected[2], tol=maxDelta, 
            msg='50th percentile (median) is not approx. expected')
        if expected[3]: h2o_util.assertApproxEqual(pctiles[7], expected[3], tol=maxDelta, 
            msg='75th percentile is not approx. expected')
        if expected[4]: h2o_util.assertApproxEqual(co.maxs[0], expected[4], tol=maxDelta, 
            msg='max is not approx. expected')

        # figure out the expected max error
        # use this for comparing to sklearn/sort
        MAX_QBINS = 1000
        if expected[0] and expected[4]:
            expectedRange = expected[4] - expected[0]
            # because of floor and ceil effects due we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange/(MAX_QBINS-2)
            maxErr = expectedBin # should we have some fuzz for fp?

        else:
            print "Test won't calculate max expected error"
            maxErr = 0

        pt = h2o_util.twoDecimals(pctiles)
        mx = h2o_util.twoDecimals(co.maxs)
        mn = h2o_util.twoDecimals(co.mins)

        print "co.label:", co.label, "co.pctiles (2 places):", pt
        print "default_pctiles:", default_pctiles
        print "co.label:", co.label, "co.maxs: (2 places):", mx
        print "co.label:", co.label, "co.mins: (2 places):", mn

        # FIX! why would pctiles be None? enums?
        if pt is None:
            compareActual = mn[0], [None] * 3, mx[0]
        else:
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]

        h2p.green_print("actual min/25/50/75/max co.label:", co.label, "(2 places):", compareActual)
        h2p.green_print("expected min/25/50/75/max co.label:", co.label, "(2 places):", expected)

    return desiredResult
Ejemplo n.º 25
0
    def test_summary2_uniform_w_NA(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0, 15000.0,
                                          20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445,
                                          -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0,
                                                 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50,
                                       1.00)),
            (ROWS, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00,
                                        100.0)),
            (ROWS, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
            (ROWS, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00,
                                          7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7,
                                           100, 00)),
            (ROWS, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00,
                                           75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18, 49.28,
                                           100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               noPrint=False,
                                               max_qbins=MAX_QBINS,
                                               numRows=numRows,
                                               numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']

            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')

            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "numRows:", numRows, "rowCount: ", rowCount
            self.assertEqual((1 + NA_ROW_RATIO) * rowCount,
                             numRows,
                             msg="numRows %s should be %s" %
                             (numRows, (1 + NA_ROW_RATIO) * rowCount))

            # don't check the last bin
            # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?

                e = rowCount / len(
                    hcnt
                )  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                # NA rows should be ignored
                self.assertAlmostEqual(b,
                                       e,
                                       delta=2 * e,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            scipyCol = 1

            h2i.delete_keys_at_all_nodes()
Ejemplo n.º 26
0
def runSummary(node=None, key=None, column=None, expected=None, maxDelta=None, noPrint=False, **kwargs):
    if not key: raise Exception('No key for Summary')
    if not node: node = h2o_nodes.nodes[0]
    # return node.summary(key, **kwargs)

    i = InspectObj(key=key)
    # just so I don't have to change names below
    missingList = i.missingList
    labelList = i.labelList
    numRows = i.numRows
    numCols = i.numCols

    # doesn't take indices? only column labels?
    # return first column, unless specified

    if not (column is None or isinstance(column, (basestring, int))):
        raise Exception("column param should be string or integer index or None %s %s" % (type(column), column))

    # either return the first col, or the col indentified by label. the column identifed could be string or index?
    if column is None: # means the summary json when we ask for col 0, will be what we return (do all though)
        colNameToDo = labelList
        colIndexToDo = range(len(labelList))
    elif isinstance(column, int):
        colNameToDo = [labelList[column]]
        colIndexToDo = [column]
    elif isinstance(column, basestring):
        colNameToDo = [column]
        colIndexToDo = [labelList.index[column]]
    else:
        raise Exception("wrong type %s for column %s" % (type(column), column))

    # we get the first column as result after walking across all, if no column parameter
    desiredResult = None
    for (colIndex, colName) in zip(colIndexToDo, colNameToDo):
        print "doing summary on %s %s" % (colIndex, colName)
        # ugly looking up the colIndex
        co = SummaryObj(key=key, colIndex=colIndex, colName=colName)
        if not desiredResult:
            desiredResult = co

        if not noPrint:
            for k,v in co:
                # only print [0] of mins and maxs because of the e308 values when they don't have dataset values
                if k=='mins' or k=='maxs':
                    print "%s[0]" % k, v[0]
                else:
                    print k, v

        if expected is not None:
            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            # what is precision. -1?
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            
            if co.pctiles:
                pctiles = [0] + co.pctiles + [0]
            else:
                pctiles = None

            # the thresholds h2o used, should match what we expected
                # expected = [0] * 5
            # Fix. doesn't check for expected = 0?

            # max of one bin
            if maxDelta is None:
                maxDelta = (co.maxs[0] - co.mins[0])/1000

            if expected[0]: h2o_util.assertApproxEqual(co.mins[0], expected[0], tol=maxDelta, 
                msg='min is not approx. expected')
            if expected[1]: h2o_util.assertApproxEqual(pctiles[3], expected[1], tol=maxDelta, 
                msg='25th percentile is not approx. expected')
            if expected[2]: h2o_util.assertApproxEqual(pctiles[5], expected[2], tol=maxDelta, 
                msg='50th percentile (median) is not approx. expected')
            if expected[3]: h2o_util.assertApproxEqual(pctiles[7], expected[3], tol=maxDelta, 
                msg='75th percentile is not approx. expected')
            if expected[4]: h2o_util.assertApproxEqual(co.maxs[0], expected[4], tol=maxDelta, 
                msg='max is not approx. expected')

            # figure out the expected max error
            # use this for comparing to sklearn/sort
            MAX_QBINS = 1000
            if expected[0] and expected[4]:
                expectedRange = expected[4] - expected[0]
                # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                # the extra bin for the max value, is an extra bin..ignore
                expectedBin = expectedRange/(MAX_QBINS-2)
                maxErr = expectedBin # should we have some fuzz for fp?

            else:
                print "Test won't calculate max expected error"
                maxErr = 0

            pt = h2o_util.twoDecimals(pctiles)

            # only look at [0] for now...bit e308 numbers if unpopulated due to not enough unique values in dataset column
            mx = h2o_util.twoDecimals(co.maxs[0])
            mn = h2o_util.twoDecimals(co.mins[0])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "co.default_pctiles:", co.default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! why would pctiles be None? enums?
            if pt is None:
                compareActual = mn, [None] * 3, mx
            else:
                compareActual = mn, pt[3], pt[5], pt[7], mx

            h2p.green_print("actual min/25/50/75/max co.label:", co.label, "(2 places):", compareActual)
            h2p.green_print("expected min/25/50/75/max co.label:", co.label, "(2 places):", expected)

    return desiredResult
Ejemplo n.º 27
0
    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0,
                                            15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445,
                                            -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0,
                                                   1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50,
                                         1.00)),
            (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00,
                                          100.0)),
            (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
            (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00,
                                            7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7,
                                             100, 00)),
            (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00,
                                             75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18,
                                             49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999
            ]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i != 0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (
                        hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" %
                                    h2o.dump_json(resultExec))
                    h2p.blue_print(
                        "\nthreshold: %.2f Exec quantile: %s Summary2: %s" %
                        (threshold, result, pt[i]))
                    if not result:
                        raise Exception(
                            "exec result: %s for quantile: %s is bad" %
                            (result, threshold))
                    h2o_util.assertApproxEqual(
                        result,
                        pctile[i],
                        tol=maxDelta,
                        msg=
                        'exec percentile: %s too different from expected: %s' %
                        (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1 == 0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    else:
                        # does this way work (column getting)j
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2')
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols, 1)
                    self.assertEqual(numRows, len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                )

            h2o.nodes[0].remove_all_keys()
Ejemplo n.º 28
0
    def test_summary2_unifiles2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # new with 1000 bins. copy expected from R
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
            # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
            ('wonkysummary.csv', 'b.hex', True, [ ('X1', 7.00, None, None, None, 1000046.0)], 'smalldata', None),
            ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),

        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList:

            if pathPrefix:
                csvPathname = pathPrefix + "/" + csvFilename
            else:
                csvPathname = csvFilename

            csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
            if skipHeader:
                header = 1
            else:
                header = 0
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, 
                schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0])

                quantile = 0.5 if DO_MEDIAN else OTHER_Q
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol,
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
                qresult = q['result']
                qresult_single = q['result_single']
                qresult_iterations = q['iterations']
                qresult_interpolated = q['interpolated']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
                h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
                print h2o.dump_json(q)

                self.assertLess(qresult_iterations, 16, 
                    msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    pctile = stats['pctile']

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected')


                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects due we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange/(MAX_QBINS-2)
                    maxErr = 0.5 * expectedBin # should we have some fuzz for fp?
                    # hack?
                    maxErr = maxErr * 2
                    print "maxErr:", maxErr

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    ## ignore for blank colnames, issues with quoted numbers
                    # covtype is too big to do in scipy
                    if colname!='' and expected[scipyCol] and csvFilename!= 'covtype.data':
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=skipHeader, # important!!
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else OTHER_Q,
                            h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                            )

                scipyCol += 1

            trial += 1
Ejemplo n.º 29
0
    def test_summary2_uniform_w_NA(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (ROWS, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (ROWS, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (ROWS, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100,00)),

            (ROWS, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, noPrint=False, max_qbins=MAX_QBINS, numRows=numRows, numCols=numCols)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']

            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')

            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            print "numRows:", numRows, "rowCount: ", rowCount
            self.assertEqual((1+NA_ROW_RATIO) * rowCount, numRows, 
                msg="numRows %s should be %s" % (numRows, (1+NA_ROW_RATIO) * rowCount))


            # don't check the last bin
            # we sometimes get a messed up histogram for all NA cols? just don't let them go thru here
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                
                e = rowCount/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                # NA rows should be ignored
                self.assertAlmostEqual(b, e, delta=2*e,
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            scipyCol = 1

            h2i.delete_keys_at_all_nodes()
Ejemplo n.º 30
0
    def test_quant_cmp_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (5*ROWS, 1, 'x.hex', 1, 20000,        ['C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00]),
            (5*ROWS, 1, 'x.hex', -5000, 0,        ['C1', -5001.00, -3750.0, -2445, -1200.0, 99]),
            (1*ROWS, 1, 'x.hex', -100000, 100000, ['C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0]),
            (1*ROWS, 1, 'x.hex', -1, 1,           ['C1',  -1.05, -0.48, 0.0087, 0.50, 1.00]),

            (1*ROWS, 1, 'A.hex', 1, 100,          ['C1',   1.05, 26.00, 51.00, 76.00, 100.0]),
            (1*ROWS, 1, 'A.hex', -99, 99,         ['C1',  -99, -50.0, 0, 50.00, 99]),

            (1*ROWS, 1, 'B.hex', 1, 10000,        ['C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00]),
            (1*ROWS, 1, 'B.hex', -100, 100,       ['C1',  -100.10, -50.0, 0.85, 51.7, 100,00]),

            (1*ROWS, 1, 'C.hex', 1, 100000,       ['C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00]),
            (1*ROWS, 1, 'C.hex', -101, 101,       ['C1',  -100.10, -50.45, -1.18, 49.28, 100.00]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
            colname = expected[0]
            maxDelta = ((expectedMax - expectedMin)/1000.0) / 2.0

            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            # need the full pathname when python parses the csv for numpy/sort
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            #***************************
            # Parse
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
            numRows = pA.numRows
            numCols = pA.numCols
            parse_key = pA.parse_key
            # this guy can take json object as first thing, or re-read with key
            iA = h2o_cmd.InspectObj(parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])

            #***************************
            # Summary
            co = h2o_cmd.runSummary(key=parse_key)
            default_pctiles = co.default_pctiles

            coList = [ co.base, len(co.bins), len(co.data), co.domain,
                co.label, co.maxs, co.mean, co.mins, co.missing, co.ninfs, co.pctiles,
                co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros]
            for c in coList:
                print c

            print "len(co.bins):", len(co.bins)
            print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
            print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

            print "FIX! hacking the co.pctiles because it's short by two"
            summ_pctiles = [0] + co.pctiles + [0]

            pt = h2o_util.twoDecimals(summ_pctiles)
            mx = h2o_util.twoDecimals(co.maxs)
            mn = h2o_util.twoDecimals(co.mins)
            exp = h2o_util.twoDecimals(expected[1:])

            print "co.label:", co.label, "co.pctiles (2 places):", pt
            print "default_pctiles:", default_pctiles
            print "co.label:", co.label, "co.maxs: (2 places):", mx
            print "co.label:", co.label, "co.mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                mn[0], pt[3], pt[5], pt[7], mx[0])
            h2p.green_print("min/25/50/75/max co.label:", co.label, "(2 places):",\
                exp[0], exp[1], exp[2], exp[3], exp[4])

            #***************************
            # Quantile
            # the thresholds h2o used, should match what we expected

            # using + here seems to result in an odd tuple..doesn't look right to h2o param
            # so went with this. Could add '[' and ']' to the list first, before the join.
            probsStr  = "[%s]" % ",".join(map(str,probsList))
            parameters = {
                'model_id': "a.hex",
                'training_frame': parse_key,
                'validation_frame': parse_key,
                'ignored_columns': None,
                'probs': probsStr,
            }

            model_key = 'qhex'
            bmResult = h2o.n0.build_model(
                algo='quantile',
                model_id=model_key,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=10)
            bm = OutputObj(bmResult, 'bm')

            msec = bm.jobs[0]['msec']
            print "bm msec", msec

            # quantile result is just a job result to a key
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0], 'model')

            print "model.output:", model.output
            print "model.output:['quantiles']", model.output['quantiles']
            print "model.output:['iterations']", model.output['iterations']
            print "model.output:['names']", model.output['names']
            quantiles = model.output['quantiles'][0] # why is this a double array
            iterations = model.output['iterations']
            assert iterations == 11, iterations
            print "quantiles: ", quantiles
            print "iterations: ", iterations

            # cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # cmm = OutputObj(cmmResult, 'cmm')

            # mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            # mm = OutputObj(mmResult, 'mm')

            # prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            # pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
            h2o_cmd.runStoreView()

            trial += 1
            # compare the last threshold
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=CHECK_PCTILE,
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=quantiles[CHECK_PCTILE_INDEX],
                    )
            h2o.nodes[0].remove_all_keys()
Ejemplo n.º 31
0
    def test_summary2_small(self):
        """Summary2 + quantiles on tiny/small synthetic datasets built from a
        fixed value set [-1, 0, 1], then compare the median against a local
        scipy/sort computation via h2o_summ.quantile_comparisons."""
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 2, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 10, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 100, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 1000, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            # (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, 0, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?

            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            # with rowCount=None the file just contains the literal values
            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = "syn_" + "binary" + "_" + str(rowFile) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

            # full pathname is needed for the local scipy/sort comparison below
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # ask the quantiles endpoint directly as well (interpolation_type=7
            # -- presumably the R type-7 scheme; confirm against the h2o API docs)
            quantile = 0.5 if DO_MEDIAN else 0.999
            q = h2o.nodes[0].quantiles(
                source_key=hex_key,
                column=0,
                interpolation_type=7,
                quantile=quantile,
                max_qbins=MAX_QBINS,
                multiple_pass=2,
            )
            qresult = q["result"]
            qresult_single = q["result_single"]
            qresult_iterations = q["iterations"]
            qresult_interpolated = q["interpolated"]
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(
                qresult_iterations,
                16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?",
            )

            # only one column
            column = summaryResult["summaries"][0]

            colname = column["colname"]

            coltype = column["type"]
            nacnt = column["nacnt"]

            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            pct = stats["pct"]
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            # pctile indexes 3/5/7 correspond to the 25/50/75 thresholds above;
            # None in expected means skip that comparison
            pctile = stats["pctile"]
            print "pctile:", pctile
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
                )
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
                )
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
                )
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(
                    b, numRows / len(hcnt), delta=1 + 0.01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e)
                )

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            # keys can be removed here: the comparison below reads the local csv,
            # not the h2o key
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != "":
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=scipyCol,  # what col to extract from the csv
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                )
Ejemplo n.º 32
0
    def test_summary2_exp(self):
        """Summary2 on exponentially-distributed synthetic data (random lambda).

        write_syn_dataset returns the actual (min, max) it generated, which is
        used to derive maxDelta; the expected tuples are all None so only the
        colname is compared, plus a final scipy/sort median comparison.
        """
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        LAMBD = random.uniform(0.005, 0.5)
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (10, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)),
            (100, 1, "x.hex", 1, 20000, ("C1", None, None, None, None, None)),
            (1000, 1, "x.hex", -5000, 0, ("C1", None, None, None, None, None)),
            (10000, 1, "x.hex", -100000, 100000, ("C1", None, None, None, None, None)),
            (100000, 1, "x.hex", -1, 1, ("C1", None, None, None, None, None)),
            (1000000, 1, "A.hex", 1, 100, ("C1", None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        # rangeMin and rangeMax are not used right now
        for (rowCount, colCount, hex_key, rangeMin, rangeMax, expected) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname, "lambd:", LAMBD
            (expectedMin, expectedMax) = write_syn_dataset(
                csvPathname, rowCount, colCount, lambd=LAMBD, SEED=SEEDPERFILE
            )
            print "expectedMin:", expectedMin, "expectedMax:", expectedMax
            # max error = half the bin size? (20 bins over the observed range)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # full pathname is needed for the local scipy/sort comparison below
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("Summary2 summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult["summaries"][0]
            colname = column["colname"]
            coltype = column["type"]
            nacnt = column["nacnt"]
            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            pct = stats["pct"]
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
            pctile = stats["pctile"]
            # the thresholds h2o used, should match what we expected
            # (all expected values are None in tryList above, so only colname
            # is actually compared here)
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
                )
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
                )
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
                )
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # exponential data: bins are not uniform, so no per-bin assertion
            print "Can't estimate the bin distribution"

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if colname != "" and expected[scipyCol]:
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=scipyCol,
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    # h2oQuantilesExact=qresult,
                )