Example #1
    def test_exec2_quantile_na_scalar(self):
        for execExpr in initList:
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=180)

        for (execExpr, num) in exprList:
            start = time.time()
            resultExec, result = h2e.exec_expr(h2o.nodes[0],
                                               execExpr,
                                               resultKey=None,
                                               timeoutSecs=180)
            print 'exec end took', time.time() - start, 'seconds'
            h2p.blue_print("h2o exec quantiles result:", result)
            self.assertEqual(
                result,
                expectedP,
                msg="Checking exec quantiles median, expectedP: %s result: %s"
                % (expectedP, result))
            print h2o.dump_json(resultExec)
            # do the quantiles page on the created key
            kwargs = {
                'column': 0,
                'quantile': QUANTILE,
                'multiple_pass': 2,
                'max_qbins': 1000,
            }
            q = h2o.nodes[0].quantiles(source_key='ddd', **kwargs)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertEqual(qresult_iterations,
                             3,
                             msg="should take 3 iterations")

            # self.assertEqual(qresult_interpolated, True, msg="Should say it's interpolating")

            self.assertEqual(
                qresult,
                expectedP,
                msg="Checking quantilespage median, expectedP: %s result: %s" %
                (expectedP, qresult))

            inspect = h2o_cmd.runInspect(key='abc')
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "numCols:", numCols
            print "numRows:", numRows
            self.assertEqual(numCols, 1)
            self.assertEqual(numRows, num)

            h2o.check_sandbox_for_errors()
Example #2
    def test_exec2_quantile_na_scalar(self):
        h2o.beta_features = True
        for execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)

        for (execExpr, num) in exprList:
            start = time.time()
            resultExec, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=180)
            print 'exec end took', time.time() - start, 'seconds'
            h2p.blue_print("h2o exec quantiles result:", result)
            self.assertEqual(result, expectedP, msg="Checking exec quantiles median, expectedP: %s result: %s" % (expectedP, result))
            print h2o.dump_json(resultExec)
            # do the quantiles page on the created key
            kwargs = {
                'column': 0,
                'quantile': QUANTILE,
                'multiple_pass': 2,
                'max_qbins': 1000,
            }
            q = h2o.nodes[0].quantiles(source_key='ddd', **kwargs)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertEqual(qresult_iterations, 3, msg="should take 3 iterations")

            # self.assertEqual(qresult_interpolated, True, msg="Should say it's interpolating")
            
            self.assertEqual(qresult, expectedP, msg="Checking quantiles page median, expectedP: %s result: %s" % (expectedP, qresult))

            inspect = h2o_cmd.runInspect(key='abc')
            numCols = inspect['numCols']
            numRows = inspect['numRows']
            print "numCols:", numCols
            print "numRows:", numRows
            self.assertEqual(numCols, 1)
            self.assertEqual(numRows, num)

            h2o.check_sandbox_for_errors()
Example #3
    def test_build_for_clone(self):
        # python gets confused about which 'start' if I used start here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4 * 3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime + 0.0) / 3600,
                         "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(
            h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!")
        h2p.blue_print(
            "Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs"
            % incrTime)
        h2p.red_print("This is just for fun")
        h2p.yellow_print("So is this")

        while (totalTime < maxTime):  # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #4
File: cloud.py Project: 100star/h2o
    def test_build_for_clone(self):
        # python gets confused about which 'start' if I used start here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 
        h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime)
        h2p.red_print("This is just for fun")
        h2p.yellow_print("So is this")

        while (totalTime<maxTime): # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #5
    def test_build_for_clone(self):
        # python gets confused about which 'start' if I used start here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 

        h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
        if CHECK_WHILE_SLEEPING:        
            h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)

        h2p.red_print("No checking of logs while sleeping, or check of cloud status")
        h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
        h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")


        while (totalTime<maxTime): # die after 4 hours
            time.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)

            ### h2o.verify_cloud_size(timeoutSecs=120)
            if CHECK_WHILE_SLEEPING:        
                print "Checking sandbox log files"
                h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
            else:
                print str(datetime.datetime.now()), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime

        # don't do this, as the cloud may be hung?
        if 1==0:
            print "Shutting down cloud, but first delete all keys"
            start = time.time()
            h2i.delete_keys_at_all_nodes()
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #6
    def test_build_for_clone(self):
        # python gets confused about which 'start' if I used start here
        elapsed = time.time() - beginning
        print "\n%0.2f seconds to get here from start" % elapsed

        # might as well open a browser on it? (because the ip/port will vary
        # maybe just print the ip/port for now
        ## h2b.browseTheCloud()

        maxTime = 4*3600
        totalTime = 0
        incrTime = 60
        h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
        print "Will check h2o logs every", incrTime, "seconds"
        print "Should be able to run another test using h2o-nodes.json to clone cloud"
        print "i.e. h2o.build_cloud_with_json()"
        print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

        h2p.green_print("To watch cloud in browser follow address:")
        h2p.green_print("   http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
        h2p.blue_print("You can start a test (or tests) now!") 

        h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
        if CHECK_WHILE_SLEEPING:        
            h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)

        h2p.red_print("No checking of logs while sleeping, or check of cloud status")
        h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
        h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")


        while (totalTime<maxTime): # die after 4 hours
            h2o.sleep(incrTime)
            totalTime += incrTime
            # good to touch all the nodes to see if they're still responsive
            # give them up to 120 secs to respond (each individually)
            h2o.verify_cloud_size(timeoutSecs=120)
            if CHECK_WHILE_SLEEPING:        
                print "Checking sandbox log files"
                h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
            else:
                print str(datetime.datetime.now()), h2o.python_cmd_line, "still here", totalTime, maxTime, incrTime

        # don't do this, as the cloud may be hung?
        if 1==0:
            print "Shutting down cloud, but first delete all keys"
            start = time.time()
            h2i.delete_keys_at_all_nodes()
            elapsed = time.time() - start
            print "delete_keys_at_all_nodes(): took", elapsed, "secs"
Example #7


# an approx? (was good when comparing to h2o type 2)
alphap=0.4
betap=0.4

# this is type 7
alphap=1
betap=1


from scipy import stats
a1 = stats.scoreatpercentile(target, per=100*OTHER_T, interpolation_method='fraction')
h2p.red_print("stats.scoreatpercentile:", a1)
a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T], alphap=alphap, betap=betap)
h2p.red_print("scipy stats.mstats.mquantiles:", a2)
targetFP.sort()
b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear')
h2p.red_print("sort algo:", b)
h2p.red_print( "from h2o (multi):", quantiles[0])

print "Now looking at the sorted list..same thing"
h2p.blue_print("stats.scoreatpercentile:", a1)
a2 = stats.mstats.mquantiles(targetFP, prob=[OTHER_T], alphap=alphap, betap=betap)
h2p.blue_print("scipy stats.mstats.mquantiles:", a2)
b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear')
h2p.blue_print("sort algo:", b)
h2p.blue_print( "from h2o (multi):", quantiles[0])

    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (100000, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (100000, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (100000, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (100000, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) 
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i!=0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                    h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, result, pt[i]))
                    if not result:
                        raise Exception("exec result: %s for quantile: %s is bad" % (result, threshold))
                    h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, 
                        msg='exec percentile: %s too different from expected: %s' % (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1==0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    else:
                        # does this way work? (column getting)
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (hex_key, ",".join(map(str,thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2') 
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols,1)
                    self.assertEqual(numRows,len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                    )

            h2o.nodes[0].remove_all_keys()
Example #9
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (ROWS, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (ROWS, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (ROWS, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (ROWS, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (ROWS, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (ROWS, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (ROWS, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (ROWS, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (ROWS, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]
            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt)
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                h2o_summ.quantile_comparisons(
                    csvPathnameFull, 
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10], 
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    )

            h2o.nodes[0].remove_all_keys()
Example #10
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, **kwargs):

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern)  = ("", path)

    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)

    # to train users / okay here
    if re.search(r"[\*<>{}[\]~`]", head):
       raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
           raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path: 
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath) 
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")
    
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)
        return (None, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        folderURI = 'nfs:/' + folderPath
        importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')
    
        # strip leading / in head if present
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"
            importResult = node.import_s3(bucket, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "maprfs:///" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)

        else: 
            raise Exception("schema not understood: %s" % schema)

    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
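
A hypothetical call to the helper above, assuming a cloud is already built (h2o.nodes populated) and a local 'smalldata' bucket resolves; the file path is a placeholder, not taken from any test. For schema='put' the return is (None, key); otherwise the (importResult, importPattern) tuple feeds a later parse.

(importResult, importPattern) = import_only(
    bucket='smalldata', path='some_dir/some_file.csv', schema='local', timeoutSecs=30)
print "import pattern h2o will parse against:", importPattern
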
Example #11
    def test_summary2_small(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 2, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 10, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 100, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 1000, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, -1, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?
        
            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta


            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else .999
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=0, interpolation_type=7,
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(qresult_iterations, 16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")


            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = twoDecimals(pctile)
            mx = twoDecimals(maxs)
            mn = twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0
            if DO_TRY_SCIPY and colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                print scipyCol, pctile[10]
                generate_scipy_comparison(csvPathnameFull, col=scipyCol,
                     # h2oMedian=pctile[5 if DO_MEDIAN else 10], result_single)
                    h2oMedian=pctile[5 if DO_MEDIAN else 10], h2oMedian2=qresult)



            h2i.delete_keys_at_all_nodes()
Example #12
    def test_summary2_unifiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # new with 1000 bins. copy expected from R
        tryList = [
            ('cars.csv', 'c.hex', [
                (None, None,None,None,None,None),
                ('economy (mpg)', None,None,None,None,None),
                ('cylinders', None,None,None,None,None),
            ],
            ),
            ('runifA.csv', 'A.hex', [
                (None,  1.00, 25.00, 50.00, 75.00, 100.0),
                ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
            ],
            ),
            # colname, (min, 25th, 50th, 75th, max)
            ('runif.csv', 'x.hex', [
                (None,  1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
            ],
            ),
            ('runifB.csv', 'B.hex', [
                (None,  1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                ('x', -100.00, -50.1, 0.974, 51.7, 100.00),
            ],
            ),

            ('runifC.csv', 'C.hex', [
                (None,  1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
            ],
            ),
        ]


        timeoutSecs = 15
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        timeoutSecs = 60
        for (csvFilename, hex_key, expectedCols) in tryList:

            csvPathname = csvFilename
            csvPathnameFull = h2i.find_folder_and_filename('smalldata', csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname,
                schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            # okay to vary MAX_QBINS because we adjust the expected accuracy
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0])
                else:
                    # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page)
                    scipyCol += 1
                    continue

                quantile = 0.5 if DO_MEDIAN else .999
                # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # for comparing to summary2
                qresult = q['result']
                qresult_single = q['result_single']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", q['iterations'])
                h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
                print h2o.dump_json(q)

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),

                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):",  h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
                    pctile = stats['pctile']


                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects we potentially lose 2 bins (worst case)
                    # the extra bin for the max value, is an extra bin..ignore
                    expectedBin = expectedRange/(MAX_QBINS-2)
                    maxErr = 0.5 * expectedBin # should we have some fuzz for fp?

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0
                    

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxErr, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxErr, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxErr, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxErr, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxErr, msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                    # need to ignore the car names
                    if colname!='' and expected[scipyCol]:
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=True,
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else 0.999,
                            # FIX! ignore for now
                            h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                            )

                        if False and h2o_util.approxEqual(pctile[5], 0.990238116744, tol=0.002, msg='stop here'):
                            raise Exception("stopping to look")
                                


                scipyCol += 1

            trial += 1
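
A worked instance of the maxErr bound computed above, using the runifC expected row and assuming MAX_QBINS=1000 (MAX_QBINS is defined outside this excerpt, so 1000 is only illustrative):

expectedRange = 100.00 - (-100.00)        # expected[5] - expected[1] = 200.0
expectedBin = expectedRange / (1000 - 2)  # ~0.2004 per bin at MAX_QBINS=1000
maxErr = 0.5 * expectedBin                # ~0.1002, the tolerance passed to assertApproxEqual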
Example #13
0
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, 
    importParentDir=True, **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    if schema=='put':
        h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +  
            "\nMeans multi-machine with 'put' will fail")
        schema = 'local'

    if src_key and schema!='put':
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o_nodes.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern)  = ("", path)

    verboseprint("head:", head)
    verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
           raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
           raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path: 
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses put:/%s" % filePath) 
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        
        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")
    
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        importResult = None
        
        return (None, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # FIX! why are we returning importPattern here..it's different than finalImportString if we import a folder?
        # is it used for key matching by others?

        # FIX! hack ..h2o-dev is creating key names with the absolute path, not the sym link path
        # messes up for import folders that go thru /home/<user>/home-0xdiag-datasets
        # importPattern = folderURI + "/" + pattern
        # could include this on the entire importPattern if we no longer have regex basename in h2o-dev?
          
        # folderURI = 'nfs:/' + folderPath
        folderURI = 'nfs:/' + os.path.realpath(folderPath)
        if importParentDir:
            finalImportString = folderPath
        else:
            finalImportString = folderPath + "/" + pattern
        importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')
    
        # strip leading / in head if present
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o_nodes.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            # this is just like s3n now? i.e. we can point down inside the s3 bucket like s3n?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution 
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        else: 
            raise Exception("schema not understood: %s" % schema)

    print "\nimport_only:", h2o_args.python_test_name, schema, "uses", finalImportString
    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
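
# A minimal usage sketch of import_only() above, assuming a cloud has already
# been built (e.g. via h2o.init()); the bucket/path values here are hypothetical.
def example_import_only_usage():
    importResult, importPattern = import_only(
        bucket='smalldata',          # hypothetical bucket name
        path='parse_folder/*.csv',   # hypothetical path; with importParentDir=True only the basename may be a pattern
        schema='local',
        timeoutSecs=60)
    # importPattern is folderURI + "/" + pattern, usable for later key matching
    print "importPattern:", importPattern
    return importResult, importPattern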
Example #14
0
print "stress the 1000 fixed binning based on (max-min)/1000"
a = [
    -1.0000002e10, -1.0000001e10, -1.0000000e10, -1.0000002e9, -1.0000001e9,
    -1.0000000e9, -1.0000002e6, -1.0000001e6, -1.0000000e6, -1.0000002e3,
    -1.0000001e3, -1.0000000e3, -1.0, 0.0000000, 1.0, 1.0000002e3, 1.0000001e3,
    1.0000000e3, 1.0000002e6, 1.0000001e6, 1.0000000e6, 1.0000002e9,
    1.0000001e9, 1.0000000e9, 1.0000002e10, 1.0000001e10, 1.0000000e10
]

initList = ["ddd = c(%s)" % ",".join(map(str, a))]

# get expected result
a.sort()
expectedP = h2o_summ.percentileOnSortedList(a, QUANTILE, interpolate='linear')
print "expectedP:", expectedP
h2p.blue_print("sort result, expectedP:", expectedP)

exprList = [
    ("abc = quantile(ddd[,1], c(%s))" % QUANTILE, 1),
]
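
# A standalone sketch of the arithmetic percentileOnSortedList presumably does
# for interpolate='linear' (assumption: it follows the usual type-7 rule, like
# numpy's default); not the library's implementation, just the fractional-index math.
import math

def linear_percentile_sketch(sortedList, q):
    # zero-based fractional index into the already-sorted data
    idx = q * (len(sortedList) - 1)
    lo = int(math.floor(idx))
    frac = idx - lo
    if frac == 0:
        return sortedList[lo]
    # interpolate between the two bounding values
    return sortedList[lo] + frac * (sortedList[lo + 1] - sortedList[lo])

# e.g. the median of [1.0, 2.0, 3.0, 4.0]: idx 1.5 -> 2.0 + 0.5 * (3.0 - 2.0) = 2.5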


class Basic(unittest.TestCase):
    def tearDown(self):
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        global SEED
        SEED = h2o.setup_random_seed()
        h2o.init(1, java_heap_GB=1)
Example #15
0
    def test_summary2_small(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (rowCount, colCount, hex_key, values, expected)
            # expected = (colname, min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use the data values
            # None in expected values means no compare
            (None, 1, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 2, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 10, 'x.hex', [-1, 0, 1], ('C1', None, None, 0, None, None)),
            (None, 100, 'x.hex', [-1, 0,
                                  1], ('C1', None, None, 0, None, None)),
            (None, 1000, 'x.hex', [-1, 0,
                                   1], ('C1', None, None, 0, None, None)),
            # (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, 0, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?

            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = 'syn_' + "binary" + "_" + str(rowFile) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values,
                              SEEDPERFILE)

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS,
                                               timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else .999
            q = h2o.nodes[0].quantiles(source_key=hex_key,
                                       column=0,
                                       interpolation_type=7,
                                       quantile=quantile,
                                       max_qbins=MAX_QBINS,
                                       multiple_pass=2)
            qresult = q['result']
            qresult_single = q['result_single']
            qresult_iterations = q['iterations']
            qresult_interpolated = q['interpolated']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(
                qresult_iterations,
                16,
                msg=
                "h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?"
            )

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']

            coltype = column['type']
            nacnt = column['nacnt']

            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            print "pctile:", pctile
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0],
                                           expected[1],
                                           tol=maxDelta,
                                           msg='min is not approx. expected')
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3],
                    expected[2],
                    tol=maxDelta,
                    msg='25th percentile is not approx. expected')
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5],
                    expected[3],
                    tol=maxDelta,
                    msg='50th percentile (median) is not approx. expected')
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7],
                    expected[4],
                    tol=maxDelta,
                    msg='75th percentile is not approx. expected')
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0],
                                           expected[5],
                                           tol=maxDelta,
                                           msg='max is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(
                    hcnt
                )  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(b,
                                       numRows / len(hcnt),
                                       delta=1 + .01 * numRows,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedList())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=scipyCol,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                )
Example #16
0
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float', h2oSummary2=None, 
   h2oQuantilesApprox=None, h2oQuantilesExact=None, interpolate='linear', quantile=0.50):
    SCIPY_INSTALLED = True
    try:
        import scipy as sp
        import numpy as np
        print "Both numpy and scipy are installed. Will do extra checks"

    except ImportError:
        print "numpy or scipy is not installed. Will only do sort-based checking"
        SCIPY_INSTALLED = False

    target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype,
        skipHeader=skipHeader, preview=5)

    if datatype=='float':
        # to make irene's R runif files' first col work (quoted row numbers, integers);
        # shouldn't hurt anyone else?
        # strip " from left and right (ignoring leading/trailing whitespace)
        targetFP= map(float, target)
        # targetFP= np.array(tFP, np.float)
    if datatype=='int':
        targetFP= map(int, target)


    # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html
    # numpy.percentile has simple linear interpolate and midpoint
    # need numpy 1.9 for interpolation. numpy 1.8 doesn't have
    # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint')
    # 1.8
    if SCIPY_INSTALLED:
        p = np.percentile(targetFP, quantile*100)
        h2p.red_print("numpy.percentile", p)

        # per = [100 * t for t in thresholds]
        from scipy import stats
        s1 = stats.scoreatpercentile(targetFP, quantile*100)
        h2p.red_print("scipy stats.scoreatpercentile", s1)

        # scipy apparently doesn't have the use of means (type 2)
        # http://en.wikipedia.org/wiki/Quantile
        # it has median (R-8) with 1/3, 1/3

        if 1==0:
            # type 6
            alphap=0
            betap=0

            # type 5 okay but not perfect
            alphap=0.5
            betap=0.5

            # type 8
            alphap=1/3.0
            betap=1/3.0

        if interpolate=='mean':
            # an approx? (was good when comparing to h2o type 2)
            alphap=0.4
            betap=0.4

        if interpolate=='linear':
            # this is type 7
            alphap=1
            betap=1

        s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap)
        s2 = s2List[0]
        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html
        # type 7 
        # alphap=0.4, betap=0.4, 
        # type 2 not available? (mean)
        # alphap=1/3.0, betap=1/3.0 is approx median?
        h2p.red_print("scipy stats.mstats.mquantiles:", s2)


    # also get the quantile with a plain sort (percentileOnSortedList())
    # in-place sort
    targetFP.sort()

    # this matches scipy type 7 (linear)
    # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear')
    # this matches h2o type 2 (mean)
    # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean')
    b = percentileOnSortedList(targetFP, quantile, interpolate='linear')
    label = str(quantile * 100) + '%'
    h2p.blue_print(label, "from sort:", b)

    if SCIPY_INSTALLED:
        h2p.blue_print(label, "from numpy:", p)
        h2p.blue_print(label, "from scipy 1:", s1)
        h2p.blue_print(label, "from scipy 2:", s2)

    h2p.blue_print(label, "from h2o summary:", h2oSummary2)
    h2p.blue_print(label, "from h2o multipass:"******"from h2o singlepass:"******"h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox)
        h2o_util.assertApproxEqual(h2oQuantilesApprox, b, rel=0.5,
            msg='h2o quantile singlepass is not approx. same as sort algo')

    if h2oQuantilesExact:
        if math.isnan(float(h2oQuantilesExact)):
            raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact)
        h2o_util.assertApproxEqual(h2oQuantilesExact, b, tol=0.0000002, 
            msg='h2o quantile multipass is not approx. same as sort algo')

    if h2oSummary2:
        if math.isnan(float(h2oSummary2)):
            raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2)
        h2o_util.assertApproxEqual(h2oSummary2, b, rel=0.5,
            msg='h2o summary2 is not approx. same as sort algo')

    if SCIPY_INSTALLED:
        if h2oQuantilesApprox:
            h2o_util.assertApproxEqual(h2oQuantilesExact, p, tol=0.0000002,
                msg='h2o quantile multipass is not same as numpy.percentile')
            h2o_util.assertApproxEqual(h2oQuantilesExact, s1, tol=0.0000002,
                msg='h2o quantile multipass is not same as scipy stats.scoreatpercentile')

        # give us some slack compared to the scipy use of median (instead of desired mean)
        if h2oQuantilesExact:
            if interpolate=='mean':
                h2o_util.assertApproxEqual(h2oQuantilesExact, s2, rel=0.01,
                    msg='h2o quantile multipass is not approx. same as scipy stats.mstats.mquantiles')
            else:
                h2o_util.assertApproxEqual(h2oQuantilesExact, s2, tol=0.0000002,
                    msg='h2o quantile multipass is not same as scipy stats.mstats.mquantiles')

        # see if scipy changes. nope. it doesn't 
        if 1==0:
            a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap)
            h2p.red_print("after sort")
            h2p.red_print("scipy stats.mstats.mquantiles:", s3)
Example #17
0
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
            # expected = [colname, min, 25th, 50th, 75th, max]
            (ROWS, 1, 'x.hex', 0.0, 20000.0,
             ['C1', 0, 5000.0, 10000.0, 15000.0, 20000.0]),
            (ROWS, 1, 'x.hex', -5000.0, 0.0,
             ['C1', -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
            (ROWS, 1, 'x.hex', -100000.0, 100000.0,
             ['C1', -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
            (ROWS, 1, 'x.hex', -1.0, 1.0, ['C1', -1.0, -0.50, 0.0, 0.50, 1.0]),
            (ROWS, 1, 'A.hex', 1.0, 100.0,
             ['C1', 1.0, 26.0, 51.0, 76.0, 100.0]),
            (ROWS, 1, 'A.hex', -99.0, 99.0,
             ['C1', -99.0, -50.0, 0.0, 50.0, 99.0]),
            (ROWS, 1, 'B.hex', 1.0, 10000.0,
             ['C1', 1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
            (ROWS, 1, 'B.hex', -100.0, 100.0,
             ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
            (ROWS, 1, 'C.hex', 1.0, 100000.0,
             ['C1', 1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
            (ROWS, 1, 'C.hex', -100.0, 100.0,
             ['C1', -100.0, -50.0, 0.0, 50.0, 100.0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount,
                                                       colCount, expectedMin,
                                                       expectedMax,
                                                       SEEDPERFILE)
            # adjust the min/max depending on what the min/max actually was!
            # the expected 25%/50%/75% will still be off
            expected[1] = actualMin
            expected[5] = actualMax

            # max error = half the bin size?
            # use this for comparing to sklearn/sort
            expectedRange = expectedMax - expectedMin
            # because of floor and ceil effects, we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange / (MAX_QBINS - 2)
            maxDelta = 1 * expectedBin

            # how much error do we get in the random distribution gen? pain. It's a probability issue
            # smaller error likely with larger # of values.
            # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
            # at actual data
            # this is way too coarse. can't get the distribution tight?
            maxDeltaPlusDistVariance = 10 * maxDelta
            # allow some fuzz in the comparison to scipy/sort
            maxDelta = 1.1 * maxDelta

            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key,
                                       column=column['colname'],
                                       quantile=quantile,
                                       max_qbins=MAX_QBINS,
                                       multiple_pass=2,
                                       interpolation_type=7)  # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            # these should match exactly except for fp compare error?
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       rel=.00001,
                                       msg='min is not expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       rel=.00001,
                                       msg='max is not expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99
            ]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance,
                msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance,
                msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance,
                msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt) > 2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                self.assertAlmostEqual(b,
                                       rowCount / len(hcnt),
                                       delta=.01 * rowCount,
                                       msg="Bins not right. b: %s e: %s" %
                                       (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname,
                            "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedList())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxDelta,
                )

            h2o.nodes[0].remove_all_keys()
Example #18
0
def findQuantile(d, dmin, dmax, threshold):
    # return the value at the threshold, or the mean of the two rows that bound it.
    # fixed bin count per pass. Stops at maxIterations if not resolved to one true answer
    maxIterations = 30

    # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere)
    totalRows = len(d)
    # Used to have 
    desiredBinCnt = BIN_COUNT
    maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues

    # initialize
    newValStart = dmin
    newValEnd   = dmax
    newValRange = newValEnd - newValStart
    desiredBinCnt = BIN_COUNT # Could do per-pass adjustment, but fixed works fine.
    newValBinSize  = newValRange / (desiredBinCnt + 0.0)
    newLowCount = 0 # count of rows below the bins
    # yes there is no newHighCount. Created during the pass, though.

    # state shared by each pass
    assert maxBinCnt > 0

    hcnt2 = [None for b in range(maxBinCnt)]
    hcnt2_min = [None for b in range(maxBinCnt)]
    hcnt2_max = [None for b in range(maxBinCnt)]
    hcnt2_low = 0
    hcnt2_high = 0

    assert newValBinSize != 0 # can be negative
    assert newValEnd > newValStart
    assert newValRange > 0

    # break out on stopping condition
    # reuse the histogram array hcnt2[]
    iteration = 0
    done = False
    # append to a list of best guesses per pass
    best_result = []

    def htot2():
        return sum(hcnt2) + hcnt2_low + hcnt2_high
        
    while iteration <= maxIterations and not done:
        h2p.green_print("newValStart", newValStart)
        h2p.green_print("newValEnd", newValEnd)
        h2p.green_print("newValRange", newValRange)
        h2p.green_print("newValBinSize", newValBinSize)
        h2p.green_print("newLowCount", newLowCount)
        h2p.green_print("threshold", threshold)

        valStart = newValStart
        valEnd   = newValEnd
        valRange = newValRange
        valBinSize = newValBinSize
        lowCount = newLowCount
        desiredBinCnt = BIN_COUNT
        maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues

        # playing with creating relative NUDGE values to make sure bin range
        # is always inclusive of target.
        # ratio it down from valBinSize. 
        # It doesn't need to be as big as valBinSize.
        # implicitly, it shouldn't need to be as large as valBinSize
        # can't seem to make it work yet. leave NUDGE=0
        NUDGE = 0

        # init to zero for each pass
        for b in range(maxBinCnt):
            hcnt2[b] = 0.0

        # Init counts outside of the bins
        hcnt2_low = 0
        hcnt2_high = 0

        # minimum value seen above the top bin; needed for interpolation
        hcnt2_high_min = None

        for val in d:
            # Need to count the stuff outside the bin-gathering, 
            # since threshold compare is based on total row compare
            # on first pass, shouldn't see anything exceed the start/end bounds
            # since those are min/max for the column? (shouldn't be any fp precision issue? or ??)
            # oh wait, this valOffset math creates possible precision issue?
            # maybe we should address it with the NUDGE value below? but what about first pass?
            valOffset = val - valStart
            # where are we zeroing in? (start)
            binIdx2 = int(math.floor(valOffset / (valBinSize + 0.0))) # make sure it's always an fp divide?

            # do some close looking for possible fp arith issues
            cA = valOffset < 0
            cB = binIdx2 < 0
            t = {True: 1, False: 0}
            # we get the 10 case
            if ((cA and not cB) or (not cA and cB)):
                h2p.red_print("AB Interesting lower bin edge case %s%s" % (t[cA], t[cB]), "cA", cA, "cB", cB, "valOffSet", valOffSet, \
                    "binIdx2", binIdx2)
            cC = val > valEnd
            cD = binIdx2 >= (maxBinCnt-1) # tighten the compare for printing
            if ((cC and not cD) or (not cC and cD)):
                h2p.red_print("CD Interesting upper bin edge case %s%s" % (t[cC], t[cD]), "cC", cC, "cB", cD, "val", val, "valEnd", valEnd, \
                    "binIdx2", binIdx2, "maxBinCnt", maxBinCnt)
                # example hits this case..i.e. the max value
                # CD Interesting upper bin edge case 01 cC False cB True val 100.995097486 valEnd 100.995097486 binIdx2 2 maxBinCnt 3
                
            if valOffset < 0 or binIdx2<0:
            # if valOffset < 0:
            # if binIdx2<0:
                hcnt2_low += 1
            # prevent the extra bin from being used..i.e. eliminate the fuzziness for sure!
            # have to use both compares, since can wrap the index (due to start/end shift)
            # elif val > valEnd or binIdx2>=(maxBinCnt-1):
            # should this really be a valOffset compare?
            elif val > valEnd or binIdx2 >= maxBinCnt:
            # elif val > valEnd:
            # elif binIdx2>=(maxBinCnt-1):
                if (hcnt2_high==0) or (val < hcnt2_high_min):
                    hcnt2_high_min = val;
                    print "hcnt2_high_min update:", hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd
                hcnt2_high += 1
            else:
                # print "(multi) val: ",val," valOffset: ",valOffset," valBinSize: ",valBinSize

                assert binIdx2 >=0 and binIdx2<=(maxBinCnt-1), "val %s %s %s %s binIdx2: %s maxBinCnt: %s valBinSize: %s" % \
                    (val, valStart, valEnd, valOffset, binIdx2, maxBinCnt, valBinSize)
                if hcnt2[binIdx2]==0 or (val < hcnt2_min[binIdx2]):
                    hcnt2_min[binIdx2] = val;
                if hcnt2[binIdx2]==0 or (val > hcnt2_max[binIdx2]):
                    hcnt2_max[binIdx2] = val;
                hcnt2[binIdx2] += 1

                # check if we went into the magic extra bin
                if binIdx2 == (maxBinCnt-1):
                    print "\nFP! val went into the extra maxBinCnt bin:", \
                    binIdx2, hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd,"\n"
        
            # check the legal states for these two
            # we don't have None for checking hcnt2_high_min in java
            assert hcnt2_high==0 or (hcnt2_high_min is not None)
            assert (hcnt2_high_min is None) or hcnt2_high!=0

        # everything should either be in low, the bins, or high
        totalBinnedRows = htot2()
        print "totalRows check: %s htot2(): %s should be equal. hcnt2_low: %s hcnt2_high: %s" % \
            (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) 

        assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal. hcnt2_low: %s hcnt2_high: %s" % \
            (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) 

        # now walk thru and find out what bin to look inside
        currentCnt = hcnt2_low
        targetCntFull = threshold * (totalRows-1)  # zero based indexing
        targetCntInt = int(math.floor(threshold * (totalRows-1)))
        targetCntFract = targetCntFull  - targetCntInt
        assert targetCntFract>=0 and targetCntFract<=1
        print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract

        k = 0
        while ((currentCnt + hcnt2[k]) <= targetCntInt): 
            # print "looping for k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k]
            currentCnt += hcnt2[k]
            # ugly but have to break out if we'd cycle along with == adding h0's until we go too far
            # are we supposed to advance to a non-zero bin?
            k += 1 # goes over in the equal case?
            # if currentCnt >= targetCntInt:
            #     break
            if k==maxBinCnt:
                break
            assert k<maxBinCnt, "k too large, k: %s maxBinCnt %s %s %s %s" % (k, maxBinCnt, currentCnt, targetCntInt, hcnt2[k-1])

        # format string to match java Log.info() in Quantiles.java
        print "Found k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k]
        assert hcnt2[k]!=1 or hcnt2_min[k]==hcnt2_max[k]

        # some possibly interpolating guesses first, in case we have to iterate (best guess)
        done = False
        guess = (hcnt2_max[k] - hcnt2_min[k]) / 2

        if currentCnt==targetCntInt:
            if hcnt2[k]>2 and (hcnt2_min[k]==hcnt2_max[k]):
                guess = hcnt2_min[k]
                print "Guess A", guess, k, hcnt2[k]

            if hcnt2[k]==2:
                print "\nTwo values in this bin but we could be aligned to the 2nd. so can't stop"
                # no matter what size the fraction, it would land on this number
                guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0

                if INTERPOLATION_TYPE==2: # type 2 (mean)
                    guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0

                else: # default to type 7 (linear interpolation)
                    # Unlike mean, which just depends on two adjacent values, this adjustment
                    # adds possible errors related to the arithmetic on the total # of rows.
                    dDiff = hcnt2_max[k] - hcnt2_min[k] # two adjacent..as if sorted!
                    pctDiff = targetCntFract # This is the fraction of total rows
                    guess = hcnt2_min[k] + (pctDiff * dDiff)

                done = False
                print "Guess B", guess

            if hcnt2[k]==1 and targetCntFract==0:
                assert hcnt2_min[k]==hcnt2_max[k]
                guess = hcnt2_min[k]
                done = True
                print "k", k
                print "Guess C", guess

            if hcnt2[k]==1 and targetCntFract!=0:
                assert hcnt2_min[k]==hcnt2_max[k]
                print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero"
                if k<maxBinCnt:
                    nextK = k + 1 # could put it over maxBinCnt
                else:
                    nextK = k
                while nextK<maxBinCnt and hcnt2[nextK]==0:
                    nextK += 1

                # have the "extra bin" for this
                if nextK >= maxBinCnt:
                    assert hcnt2_high!=0
                    print "Using hcnt2_high_min for interpolate:", hcnt2_high_min
                    nextVal = hcnt2_high_min
                else:
                    print "Using nextK for interpolate:", nextK
                    assert hcnt2[nextK]!=0
                    nextVal = hcnt2_min[nextK]

                guess = (hcnt2_max[k] + nextVal) / 2.0
                # OH! fixed bin as opposed to sort. Of course there are gaps between k and nextK

                if INTERPOLATION_TYPE==2: # type 2 (mean)
                    guess = (hcnt2_max[k] + nextVal) / 2.0
                    pctDiff = 0.5
                else: # default to type 7 (linear interpolation)
                    dDiff = nextVal - hcnt2_max[k] # two adjacent, as if sorted!
                    pctDiff = targetCntFract # This is the fraction of total rows
                    guess = hcnt2_max[k] + (pctDiff * dDiff)


                done = True # has to be one above us when needed. (or we're at end)

                print 'k', 'hcnt2_max[k]', 'nextVal'
                print "hello3:", k, hcnt2_max[k], nextVal
                print "\nInterpolating result using nextK: %s nextVal: %s" % (nextK, nextVal)
                print "Guess D", guess

        if not done:
            print "Not done, setting new range",\
                "k: ", k,\
                "currentCnt: ", currentCnt,\
                "hcnt2_min[k]: ", hcnt2_min[k],\
                "hcnt2_max[k]: ", hcnt2_max[k]

            # possible bin leakage at start/end edges due to fp arith.
            # the bin index arith may resolve OVER the boundary created by the compare for hcnt2_high compare
            # rather than using NUDGE, see if there's a non-zero bin below (min) or above (max) you.
            # Just need to check the one bin below and above k, if they exist. 
            if k > 0 and hcnt2[k-1]>0 and (hcnt2_max[k-1]<hcnt2_min[k]):
                newValStart = hcnt2_max[k-1]
            else:
                newValStart = hcnt2_min[k]

            # subtle. we do put stuff in the extra end bin (see the print above that happens)
            # k might be pointing to one less than that (like k=0 for 1 bin case)
            if k < maxBinCnt and hcnt2[k+1]>0 and (hcnt2_min[k+1]>hcnt2_max[k]):
                print "hello"
                newValEnd = hcnt2_min[k+1]
            else:
                newValEnd = hcnt2_max[k]
            
            newValRange = newValEnd - newValStart 
            # maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues?
            newValBinSize = newValRange / (desiredBinCnt + 0.0)
            
            # the start/end should never change if we're just using one bin
            # this is a bin leakage test if you use one bin (we should never resolve exactly; we stop at max iterations)
            # assumes NUDGE is 0
            if NUDGE == 0.0:
                assert desiredBinCnt>1 or (valStart==newValStart and valEnd==newValEnd),\
                    "if 1 bin, should be no per-pass edge leakage %s %s %s %s %s %s" % (k, hcnt2_high, valStart, newValStart, valEnd, newValEnd)
            newLowCount = currentCnt
            if newValBinSize==0:
                # assert done or newValBinSize!=0 and live with current guess
                print "Assuming done because newValBinSize is 0."
                print "newValRange: %s, hcnt2[k]: %s hcnt2_min[k]: %s hcnt2_max[k]: %s" %\
                     (newValRange, hcnt2[k], hcnt2_min[k], hcnt2_max[k])
                guess = newValStart
                print "Guess E", guess
                done = True

            # if we have to interpolate
            # if it falls into this bin, interpolate to this bin means one answer?

            # cover the case above with multiple entries in a bin, all the same value
            # will be zero on the last pass?
            # assert newValBinSize != 0 or done
            # need the count up to but not including newValStart

        best_result.append(guess)
        iteration += 1

        h2p.blue_print("Ending Pass", iteration)
        h2p.blue_print("best_result:", best_result, "done:", done, "hcnt2[k]", hcnt2[k])
        print "currentCnt", currentCnt, "targetCntInt", targetCntInt, "hcnt2_low", hcnt2_low, "hcnt2_high", hcnt2_high
        print "was", valStart, valEnd, valRange, valBinSize
        print "next", newValStart, newValEnd, newValRange, newValBinSize

    return best_result[-1]
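
# A hedged driver sketch for findQuantile() above: it relies on module-level
# BIN_COUNT and INTERPOLATION_TYPE and on being handed the column's true
# min/max. The values below are hypothetical.
if __name__ == '__main__':
    import random
    BIN_COUNT = 1000           # fixed bin count per pass (assumed global)
    INTERPOLATION_TYPE = 7     # linear, matching the tests above (assumed global)
    d = [random.random() for _ in xrange(100000)]   # single column, no NAs
    print "approx. median:", findQuantile(d, min(d), max(d), 0.5)
    print "approx. 99.9 percentile:", findQuantile(d, min(d), max(d), 0.999)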
Example #19
0
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (rowCount, colCount, hex_key, expectedMin, expectedMax, expected)
            # expected = [colname, min, 25th, 50th, 75th, max]
            (ROWS, 1, 'x.hex', 0.0, 20000.0,        ['C1',  0, 5000.0, 10000.0, 15000.0, 20000.0]),
            (ROWS, 1, 'x.hex', -5000.0, 0.0,        ['C1',  -5000.0, -3750.0, -2500.0, -1250.0, 0.0]),
            (ROWS, 1, 'x.hex', -100000.0, 100000.0, ['C1',  -100000.0, -50000.0, 0.0, 50000.0, 100000.0]),
            (ROWS, 1, 'x.hex', -1.0, 1.0,           ['C1',  -1.0, -0.50, 0.0, 0.50, 1.0]),

            (ROWS, 1, 'A.hex', 1.0, 100.0,          ['C1',   1.0, 26.0, 51.0, 76.0, 100.0]),
            (ROWS, 1, 'A.hex', -99.0, 99.0,         ['C1',  -99.0, -50.0, 0.0, 50.0, 99.0]),

            (ROWS, 1, 'B.hex', 1.0, 10000.0,        ['C1',   1.0, 2501.0, 5001.0, 7501.0, 10000.0]),
            (ROWS, 1, 'B.hex', -100.0, 100.0,       ['C1',  -100.0, -50.0, 0.0, 50.0, 100.0]),

            (ROWS, 1, 'C.hex', 1.0, 100000.0,       ['C1',   1.0, 25001.0, 50001.0, 75001.0, 100000.0]),
            (ROWS, 1, 'C.hex', -100.0, 100.0,       ['C1',  -100.0, -50.0, 0.0, 50.0, 100.0]),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            (actualMax, actualMin) = write_syn_dataset(csvPathname, rowCount, colCount, 
                expectedMin, expectedMax, SEEDPERFILE)
            # adjust the min/max depending on what the min/max actually was!
            # the expected 25%/50%/75% will still be off
            expected[1] = actualMin
            expected[5] = actualMax

            # max error = half the bin size?
            # use this for comparing to sklearn/sort
            expectedRange = expectedMax - expectedMin
            # because of floor and ceil effects, we potentially lose 2 bins (worst case)
            # the extra bin for the max value, is an extra bin..ignore
            expectedBin = expectedRange/(MAX_QBINS-2)
            maxDelta = 0.5 * expectedBin

            # how much error do we get in the random distribution gen? pain. It's a probability issue
            # smaller error likely with larger # of values.
            # the maxDelta used for the scipy/sort compare can be tighter, since it's looking
            # at actual data
            # this is way too coarse. can't get the distribution tight? 
            maxDeltaPlusDistVariance = 10 * maxDelta
            # allow some fuzz in the comparison to scipy/sort
            maxDelta = 1.1 * maxDelta 

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=30, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]
            colname = column['colname']
            self.assertEqual(colname, expected[0])

            quantile = 0.5 if DO_MEDIAN else .999
            # get both answers since we feed both below for checking
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column['colname'],
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2, interpolation_type=7) # linear
            qresult = q['result']
            qresult_single = q['result_single']
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", q['iterations'])
            h2p.blue_print("h2o quantiles interpolated:", q['interpolated'])
            print h2o.dump_json(q)

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            # these should match exactly except for fp compare error?
            h2o_util.assertApproxEqual(mins[0], expected[1], rel=.00001, msg='min is not expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], rel=.00001, msg='max is not expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct= [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDeltaPlusDistVariance, 
                msg='25th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDeltaPlusDistVariance, 
                msg='50th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDeltaPlusDistVariance, 
                msg='75th percentile is not approx. expected for generated uniform range %s %s' %\
                    (expectedMin, expectedMax))

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            # too hard to estimate when there are ints now, due to floor/ceil int alignment?
            # don't check the last two bins
            for b in hcnt[1:(-2 if len(hcnt)>2 else -1)]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt)
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1


            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname!='':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedList())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull, 
                    col=0, # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10], 
                    h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    h2oSummary2MaxErr=maxDelta,
                    )

            h2o.nodes[0].remove_all_keys()
Example #20
0
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.5, initialDelaySecs=0.5, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False, 
    importParentDir=True, **kwargs):

    if src_key and schema!='put':
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern)  = ("", path)

    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
           raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
           raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
           raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path: 
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath) 
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")
    
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)
        return (None, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

          
        folderURI = 'nfs:/' + folderPath
        if importParentDir:
            importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)
        else:
            importResult = node.import_files(folderPath + "/" + pattern, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')
    
        # strip leading / in head if present
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head
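        # e.g. bucket='home-0xdiag-datasets', head='standard' -> folderOffset='home-0xdiag-datasets/standard';
        # bucket='smalldata', head='' -> folderOffset='smalldata'; no bucket -> folderOffset is just head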

        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"
            importResult = node.import_s3(bucket, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
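            # e.g. with a (hypothetical) hdfs_name_node 'mr-0x6' and folderOffset
            # 'home-0xdiag-datasets/standard' this yields
            # 'maprfs://mr-0x6/home-0xdiag-datasets/standard'; without a name node,
            # 'maprfs:/home-0xdiag-datasets/standard'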
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s hdfs_config: %s" % \
                    (n.use_hdfs, n.hdfs_version, n.hdfs_name_node, n.hdfs_config)
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                importResult = node.import_hdfs(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_hdfs(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        else: 
            raise Exception("schema not understood: %s" % schema)

    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
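
# A minimal usage sketch (not part of the original tests; assumes a cloud was already built
# with h2o.init() and that the bucket/path resolve on this machine):
#
#   (importResult, importPattern) = import_only(bucket='smalldata',
#       path='quantiles/breadth.csv', schema='local')
#   print "importPattern:", importPattern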
Example #21
0
    def test_summary2_unifiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # new with 1000 bins. copy expected from R
        tryList = [
            (
                'cars.csv',
                'c.hex',
                [
                    (None, None, None, None, None, None),
                    ('economy (mpg)', None, None, None, None, None),
                    ('cylinders', None, None, None, None, None),
                ],
            ),
            (
                'runifA.csv',
                'A.hex',
                [
                    (None, 1.00, 25.00, 50.00, 75.00, 100.0),
                    ('x', -99.9, -44.7, 8.26, 58.00, 91.7),
                ],
            ),
            # colname, (min, 25th, 50th, 75th, max)
            (
                'runif.csv',
                'x.hex',
                [
                    (None, 1.00, 5000.0, 10000.0, 15000.0, 20000.00),
                    ('D', -5000.00, -3735.0, -2443, -1187.0, 99.8),
                    ('E', -100000.0, -49208.0, 1783.8, 50621.9, 100000.0),
                    ('F', -1.00, -0.4886, 0.00868, 0.5048, 1.00),
                ],
            ),
            (
                'runifB.csv',
                'B.hex',
                [
                    (None, 1.00, 2501.00, 5001.00, 7501.00, 10000.00),
                    ('x', -100.00, -50.1, 0.974, 51.7, 100.00),
                ],
            ),
            (
                'runifC.csv',
                'C.hex',
                [
                    (None, 1.00, 25002.00, 50002.00, 75002.00, 100000.00),
                    ('x', -100.00, -50.45, -1.135, 49.28, 100.00),
                ],
            ),
        ]

        timeoutSecs = 15
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        timeoutSecs = 60
        for (csvFilename, hex_key, expectedCols) in tryList:

            csvPathname = csvFilename
            csvPathnameFull = h2i.find_folder_and_filename('smalldata',
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)

            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            # okay to vary MAX_QBINS because we adjust the expected accuracy
            summaryResult = h2o_cmd.runSummary(key=hex_key,
                                               max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))
            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0],
                                     msg="colname: %s expected: %s" % (colname, expected[0]))
                else:
                    # if the colname is None, skip it (so we don't barf on strings on the h2o quantile page)
                    scipyCol += 1
                    continue

                quantile = 0.5 if DO_MEDIAN else .999
                # h2o has problem if a list of columns (or dictionary) is passed to 'column' param
                q = h2o.nodes[0].quantiles(
                    source_key=hex_key,
                    column=column['colname'],
                    quantile=quantile,
                    max_qbins=MAX_QBINS,
                    multiple_pass=2,
                    interpolation_type=7)  # for comparing to summary2
                qresult = q['result']
                qresult_single = q['result_single']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", q['iterations'])
                h2p.blue_print("h2o quantiles interpolated:",
                               q['interpolated'])
                print h2o.dump_json(q)

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),

                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype = stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype != 'Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                        mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                        sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    expectedPct = [
                        0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9,
                        0.95, 0.99
                    ]
                    pctile = stats['pctile']

                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects we potentially lose 2 bins (worst case)
                    # the extra bin for the max value is ignored
                    expectedBin = expectedRange / (MAX_QBINS - 2)
                    maxErr = 0.5 * expectedBin  # should we have some fuzz for fp?

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0
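                # rough arithmetic sketch (assuming MAX_QBINS=1000, per the "new with 1000 bins"
                # comment above): for runifC, expected[1]=1.00 and expected[5]=100000.00, so
                # expectedRange ~= 99999, expectedBin ~= 99999/998 ~= 100.2, maxErr ~= 50.1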

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(
                        mins[0],
                        expected[1],
                        tol=maxErr,
                        msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(
                        pctile[3],
                        expected[2],
                        tol=maxErr,
                        msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(
                        pctile[5],
                        expected[3],
                        tol=maxErr,
                        msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(
                        pctile[7],
                        expected[4],
                        tol=maxErr,
                        msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(
                        maxs[0],
                        expected[5],
                        tol=maxErr,
                        msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype != 'Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    # don't check if colname is empty..means it's a string and scipy doesn't parse right?
                    # need to ignore the car names
                    if colname != '' and expected[scipyCol]:
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=True,
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else 0.999,
                            # FIX! ignore for now
                            h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                        )

                        if False and h2o_util.approxEqual(pctile[5],
                                                          0.990238116744,
                                                          tol=0.002,
                                                          msg='stop here'):
                            raise Exception("stopping to look")

                scipyCol += 1

            trial += 1
    1.0000001e9,
    1.0000000e9,
    1.0000002e10,
    1.0000001e10,
    1.0000000e10
]

initList = [
    "ddd = c(%s)" % ",".join(map(str,a))
]

# get expected result
a.sort()
expectedP = h2o_summ.percentileOnSortedList(a, QUANTILE, interpolate='linear')
print "expectedP:", expectedP
h2p.blue_print("sort result, expectedP:", expectedP)

exprList = [
    ("abc = quantile(ddd[,1], c(%s))" % QUANTILE, 1),
]

class Basic(unittest.TestCase):
    def tearDown(self):
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        global SEED
        SEED = h2o.setup_random_seed()
        h2o.init(1, java_heap_GB=1)
    def test_summary2_unifiles2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # new with 1000 bins. copy expected from R
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
            # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
            ('wonkysummary.csv', 'b.hex', True, [ ('X1', None, None, None, None, None)], 'smalldata', None),
            ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),

        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList:
            h2o.beta_features = False

            if pathPrefix:
                csvPathname = pathPrefix + "/" + csvFilename
            else:
                csvPathname = csvFilename

            csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
            if skipHeader:
                header = 1
            else:
                header = 0
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, 
                schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            # okay to get more cols than we want
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0])

                quantile = 0.5 if DO_MEDIAN else OTHER_Q
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol,
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
                qresult = q['result']
                qresult_single = q['result_single']
                qresult_iterations = q['iterations']
                qresult_interpolated = q['interpolated']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
                h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
                print h2o.dump_json(q)

                self.assertLess(qresult_iterations, 16, 
                    msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    pctile = stats['pctile']

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected')

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    ## ignore for blank colnames, issues with quoted numbers
                    # covtype is too big to do in scipy
                    if colname!='' and expected[scipyCol] and csvFilename!= 'covtype.data':
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=skipHeader, # important!!
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else OTHER_Q,
                            h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            )


                scipyCol += 1

            trial += 1
Example #24
0
    def test_summary2_unifiles2(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # new with 1000 bins. copy expected from R
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            ('breadth.csv', 'b.hex', False, [ ('C1', None, None, None, None, None)], 'smalldata', 'quantiles'),
            # ('wonkysummary.csv', 'b.hex', False, [ ('X1', 7, 22, 876713, 100008, 1000046)], 'smalldata', None),
            ('wonkysummary.csv', 'b.hex', True, [ ('X1', 7.00, None, None, None, 1000046.0)], 'smalldata', None),
            ('covtype.data', 'c.hex', False, [ ('C1', None, None, None, None, None)], 'home-0xdiag-datasets', 'standard'),

        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (csvFilename, hex_key, skipHeader, expectedCols, bucket, pathPrefix) in tryList:

            if pathPrefix:
                csvPathname = pathPrefix + "/" + csvFilename
            else:
                csvPathname = csvFilename

            csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
            if skipHeader:
                header = 1
            else:
                header = 0
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, 
                schema='put', header=header, hex_key=hex_key, timeoutSecs=10, doSummary=False)

            print "Parse result['destination_key']:", parseResult['destination_key']

            # We should be able to see the parse result?
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            # okay to get more cols than we want
            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            summaries = summaryResult['summaries']

            scipyCol = 0
            for expected, column in zip(expectedCols, summaries):
                colname = column['colname']
                if expected[0]:
                    self.assertEqual(colname, expected[0])

                quantile = 0.5 if DO_MEDIAN else OTHER_Q
                q = h2o.nodes[0].quantiles(source_key=hex_key, column=scipyCol,
                    quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=2)
                qresult = q['result']
                qresult_single = q['result_single']
                qresult_iterations = q['iterations']
                qresult_interpolated = q['interpolated']
                h2p.blue_print("h2o quantiles result:", qresult)
                h2p.blue_print("h2o quantiles result_single:", qresult_single)
                h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
                h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
                print h2o.dump_json(q)

                self.assertLess(qresult_iterations, 16, 
                    msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?")

                # ('',  '1.00', '25002.00', '50002.00', '75002.00', '100000.00'),
                coltype = column['type']
                nacnt = column['nacnt']

                stats = column['stats']
                stattype= stats['type']
                print stattype

                # FIX! we should compare mean and sd to expected?
                # enums don't have mean or sd?
                if stattype!='Enum':
                    mean = stats['mean']
                    sd = stats['sd']
                    zeros = stats['zeros']
                    mins = stats['mins']
                    maxs = stats['maxs']

                    print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
                    print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

                    pct = stats['pct']
                    print "pct:", pct
                    print ""

                    # the thresholds h2o used, should match what we expected
                    pctile = stats['pctile']

                # hack..assume just one None is enough to ignore for cars.csv
                if expected[1]:
                    h2o_util.assertApproxEqual(mins[0], expected[1], rel=0.02, msg='min is not approx. expected')
                if expected[2]:
                    h2o_util.assertApproxEqual(pctile[3], expected[2], rel=0.02, msg='25th percentile is not approx. expected')
                if expected[3]:
                    h2o_util.assertApproxEqual(pctile[5], expected[3], rel=0.02, msg='50th percentile (median) is not approx. expected')
                if expected[4]:
                    h2o_util.assertApproxEqual(pctile[7], expected[4], rel=0.02, msg='75th percentile is not approx. expected')
                if expected[5]:
                    h2o_util.assertApproxEqual(maxs[0], expected[5], rel=0.02, msg='max is not approx. expected')


                # figure out the expected max error
                # use this for comparing to sklearn/sort
                if expected[1] and expected[5]:
                    expectedRange = expected[5] - expected[1]
                    # because of floor and ceil effects we potentially lose 2 bins (worst case)
                    # the extra bin for the max value is ignored
                    expectedBin = expectedRange/(MAX_QBINS-2)
                    maxErr = 0.5 * expectedBin # should we have some fuzz for fp?
                    # hack?
                    maxErr = maxErr * 2
                    print "maxErr:", maxErr

                else:
                    print "Test won't calculate max expected error"
                    maxErr = 0

                hstart = column['hstart']
                hstep = column['hstep']
                hbrk = column['hbrk']
                hcnt = column['hcnt']

                for b in hcnt:
                    # should we be able to check for a uniform distribution in the files?
                    e = .1 * numRows
                    # self.assertAlmostEqual(b, .1 * rowCount, delta=.01*rowCount,
                    #     msg="Bins not right. b: %s e: %s" % (b, e))

                if stattype!='Enum':
                    pt = h2o_util.twoDecimals(pctile)
                    print "colname:", colname, "pctile (2 places):", pt
                    mx = h2o_util.twoDecimals(maxs)
                    mn = h2o_util.twoDecimals(mins)
                    print "colname:", colname, "maxs: (2 places):", mx
                    print "colname:", colname, "mins: (2 places):", mn

                    # FIX! we should do an exec and compare using the exec quantile too
                    actual = mn[0], pt[3], pt[5], pt[7], mx[0]
                    print "min/25/50/75/max colname:", colname, "(2 places):", actual
                    print "maxs colname:", colname, "(2 places):", mx
                    print "mins colname:", colname, "(2 places):", mn

                    ## ignore for blank colnames, issues with quoted numbers
                    # covtype is too big to do in scipy
                    if colname!='' and expected[scipyCol] and csvFilename!= 'covtype.data':
                        # don't do for enums
                        # also get the median with a sort (h2o_summ.percentileOnSortedlist()
                        h2o_summ.quantile_comparisons(
                            csvPathnameFull,
                            skipHeader=skipHeader, # important!!
                            col=scipyCol,
                            datatype='float',
                            quantile=0.5 if DO_MEDIAN else OTHER_Q,
                            h2oSummary2=pctile[5 if DO_MEDIAN else OTHER_Q_SUMM_INDEX],
                            h2oQuantilesApprox=qresult_single,
                            h2oQuantilesExact=qresult,
                            h2oSummary2MaxErr=maxErr,
                            )

                scipyCol += 1

            trial += 1
Example #25
0
def quantile_comparisons(csvPathname,
                         skipHeader=False,
                         col=0,
                         datatype='float',
                         h2oSummary2=None,
                         h2oSummary2MaxErr=None,
                         h2oQuantilesApprox=None,
                         h2oQuantilesExact=None,
                         h2oExecQuantiles=None,
                         interpolate='linear',
                         quantile=0.50,
                         use_genfromtxt=False):
    SCIPY_INSTALLED = True
    try:
        import scipy as sp
        import numpy as np
        print "Both numpy and scipy are installed. Will do extra checks"

    except ImportError:
        print "numpy or scipy is not installed. Will only do sort-based checking"
        SCIPY_INSTALLED = False

    if not SCIPY_INSTALLED:
        return

    if use_genfromtxt:
        print "Using numpy.genfromtxt. Better handling of null bytes"
        target = np.genfromtxt(open(csvPathname, 'r'),
                               delimiter=',',
                               skip_header=1 if skipHeader else 0,
                               dtype=None)  # guess!
        # print "shape:", target.shape()

    else:
        print "Using python csv reader"
        target = h2o_util.file_read_csv_col(csvPathname,
                                            col=col,
                                            datatype=datatype,
                                            skipHeader=skipHeader,
                                            preview=5)

    if datatype == 'float':
        # to make irene's R runif files' first col work (quoted row numbers, integers);
        # shouldn't hurt anyone else?
        # strip " from the left (ignore leading whitespace)
        # strip " from the right (ignore trailing whitespace)
        targetFP = map(float, target)
        # targetFP= np.array(tFP, np.float)
    if datatype == 'int':
        targetFP = map(int, target)

    # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html
    # numpy.percentile has simple linear interpolate and midpoint
    # need numpy 1.9 for interpolation. numpy 1.8 doesn't have
    # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint')
    # 1.8
    p = np.percentile(targetFP, quantile * 100)
    h2p.red_print("numpy.percentile", p)

    # per = [100 * t for t in thresholds]
    from scipy import stats
    s1 = stats.scoreatpercentile(targetFP, quantile * 100)
    h2p.red_print("scipy stats.scoreatpercentile", s1)

    # scipy apparently doesn't have the use of means (type 2)
    # http://en.wikipedia.org/wiki/Quantile
    # it has median (R-8) with 1/3, 1/3

    if 1 == 0:
        # type 6
        alphap = 0
        betap = 0

        # type 5 okay but not perfect
        alphap = 0.5
        betap = 0.5

        # type 8
        alphap = 1 / 3.0
        betap = 1 / 3.0

    if interpolate == 'mean':
        # an approx? (was good when comparing to h2o type 2)
        alphap = 0.4
        betap = 0.4

    if interpolate == 'linear':
        # this is type 7
        alphap = 1
        betap = 1
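    # alphap=betap=1 gives plotting positions p(k) = (k-1)/(n-1), i.e. the R/numpy "type 7"
    # linear rule, matching the interpolation_type=7 requested from h2o elsewhere in these
    # tests and the sort-based percentileOnSortedList(..., interpolate='linear') check below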

    s2List = stats.mstats.mquantiles(targetFP,
                                     prob=quantile,
                                     alphap=alphap,
                                     betap=betap)
    s2 = s2List[0]
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html
    # type 7
    # alphap=0.4, betap=0.4,
    # type 2 not available? (mean)
    # alphap=1/3.0, betap=1/3.0 is approx median?
    h2p.red_print("scipy stats.mstats.mquantiles:", s2)

    # also get the median with a painful sort (h2o_summ.percentileOnSortedlist()
    # inplace sort
    targetFP.sort()

    # this matches scipy type 7 (linear)
    # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear')
    # this matches h2o type 2 (mean)
    # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean')

    b = percentileOnSortedList(targetFP, quantile, interpolate='linear')
    label = str(quantile * 100) + '%'
    h2p.blue_print(label, "from sort:", b)
    h2p.blue_print(label, "from numpy:", p)
    h2p.blue_print(label, "from scipy 1:", s1)
    h2p.blue_print(label, "from scipy 2:", s2)
    h2p.blue_print(label, "from h2o summary:", h2oSummary2)
    h2p.blue_print(label, "from h2o multipass:", h2oQuantilesExact)
    h2p.blue_print(label, "from h2o singlepass:", h2oQuantilesApprox)
    h2p.blue_print(label, "from h2o exec:", h2oExecQuantiles)

    # they should be identical. keep a tight absolute tolerance
    # Note the comparisons have different tolerances, some are relative, some are absolute
    if h2oQuantilesExact:
        if math.isnan(float(h2oQuantilesExact)):
            raise Exception("h2oQuantilesExact is unexpectedly NaN %s" %
                            h2oQuantilesExact)
        h2o_util.assertApproxEqual(
            h2oQuantilesExact,
            b,
            tol=0.0000002,
            msg='h2o quantile multipass is not approx. same as sort algo')

    if h2oQuantilesApprox:
        # this can be NaN if we didn't calculate it. turn the NaN string into a float NaN
        if math.isnan(float(h2oQuantilesApprox)):
            raise Exception("h2oQuantilesApprox is unexpectedly NaN %s" %
                            h2oQuantilesApprox)
        if h2oSummary2MaxErr:
            h2o_util.assertApproxEqual(
                h2oQuantilesApprox,
                b,
                tol=h2oSummary2MaxErr,
                msg='h2o quantile singlepass is not approx. same as sort algo')
        else:
            h2o_util.assertApproxEqual(
                h2oQuantilesApprox,
                b,
                rel=0.1,
                msg='h2o quantile singlepass is not approx. same as sort algo')

    if h2oSummary2:
        if math.isnan(float(h2oSummary2)):
            raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2)
        if h2oSummary2MaxErr:
            # maxErr absolute was calculated in the test from 0.5*(max-min/(max_qbins-2))
            h2o_util.assertApproxEqual(
                h2oSummary2,
                b,
                tol=h2oSummary2MaxErr,
                msg=
                'h2o summary2 is not approx. same as sort algo (calculated expected max error)'
            )
        else:
            # bounds are way off, since it depends on the min/max of the col, not the expected value
            h2o_util.assertApproxEqual(
                h2oSummary2,
                b,
                rel=1.0,
                msg=
                'h2o summary2 is not approx. same as sort algo (sloppy compare)'
            )

    if h2oQuantilesApprox and h2oSummary2:
        # they should both get the same answer. Currently they have different code, but same algo
        # FIX! ...changing to a relative tolerance, since we're getting a miscompare in some cases.
        # not sure why..maybe some subtle algo diff.
        h2o_util.assertApproxEqual(h2oSummary2, h2oQuantilesApprox, rel=0.04,
            msg='h2o summary2 is not approx. same as h2o singlepass.'+\
                ' Check that max_qbins is 1000 (summary2 is fixed) and type 7 interpolation')

    if h2oExecQuantiles:
        if math.isnan(float(h2oExecQuantiles)):
            raise Exception("h2oExecQuantiles is unexpectedly NaN %s" %
                            h2oExecQuantiles)
        # bounds are way off
        h2o_util.assertApproxEqual(
            h2oExecQuantiles,
            b,
            rel=1.0,
            msg='h2o summary2 is not approx. same as sort algo')

    if SCIPY_INSTALLED:
        if h2oQuantilesExact:
            h2o_util.assertApproxEqual(
                h2oQuantilesExact,
                p,
                tol=0.0000002,
                msg='h2o quantile multipass is not same as numpy.percentile')
            h2o_util.assertApproxEqual(
                h2oQuantilesExact,
                s1,
                tol=0.0000002,
                msg=
                'h2o quantile multipass is not same as scipy stats.scoreatpercentile'
            )

        # give us some slack compared to the scipy use of median (instead of desired mean)
        # since we don't have bounds here like above, just stop this test for now
        if h2oQuantilesApprox and 1 == 0:
            if interpolate == 'mean':
                h2o_util.assertApproxEqual(
                    h2oQuantilesApprox,
                    s2,
                    rel=0.5,
                    msg=
                    'h2o quantile singlepass is not approx. same as scipy stats.mstats.mquantiles'
                )
            else:
                h2o_util.assertApproxEqual(
                    h2oQuantilesApprox,
                    s2,
                    rel=0.5,
                    msg=
                    'h2o quantile singlepass is not same as scipy stats.mstats.mquantiles'
                )

        # see if scipy changes. nope. it doesn't
        if 1 == 0:
            a = stats.mstats.mquantiles(targetFP,
                                        prob=quantile,
                                        alphap=alphap,
                                        betap=betap)
            h2p.red_print("after sort")
            h2p.red_print("scipy stats.mstats.mquantiles:", s3)
Example #26
0
    def test_summary2_uniform(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (5000000, 1, 'x.hex', 1, 20000,        ('C1',  1.10, 5000.0, 10000.0, 15000.0, 20000.00)),
            (5000000, 1, 'x.hex', -5000, 0,        ('C1', -5001.00, -3750.0, -2445, -1200.0, 99)),
            (1000000, 1, 'x.hex', -100000, 100000, ('C1',  -100001.0, -50000.0, 1613.0, 50000.0, 100000.0)),
            (1000000, 1, 'x.hex', -1, 1,           ('C1',  -1.05, -0.48, 0.0087, 0.50, 1.00)),

            (1000000, 1, 'A.hex', 1, 100,          ('C1',   1.05, 26.00, 51.00, 76.00, 100.0)),
            (1000000, 1, 'A.hex', -99, 99,         ('C1',  -99, -50.0, 0, 50.00, 99)),

            (1000000, 1, 'B.hex', 1, 10000,        ('C1',   1.05, 2501.00, 5001.00, 7501.00, 10000.00)),
            (1000000, 1, 'B.hex', -100, 100,       ('C1',  -100.10, -50.0, 0.85, 51.7, 100.00)),

            (1000000, 1, 'C.hex', 1, 100000,       ('C1',   1.05, 25002.00, 50002.00, 75002.00, 100000.00)),
            (1000000, 1, 'C.hex', -101, 101,       ('C1',  -100.10, -50.45, -1.18, 49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
            # max error = half the bin size?
        
            maxDelta = ((expectedMax - expectedMin)/20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta
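            # e.g. for the first tryList entry (expectedMin=1, expectedMax=20000):
            # maxDelta = ((20000 - 1)/20.0)/2.0 ~= 500.0, then * 1.05 ~= 525.0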

            h2o.beta_features = False
            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
            h2o.beta_features = False
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["num_rows"]
            numCols = inspect["num_cols"]

            h2o.beta_features = True
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype= stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", twoDecimals(sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(pctile[3], expected[2], tol=maxDelta, msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(pctile[5], expected[3], tol=maxDelta, msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(pctile[7], expected[4], tol=maxDelta, msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows/len(hcnt) # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount, 
                    msg="Bins not right. b: %s e: %s" % (b, e))

            pt = twoDecimals(pctile)
            mx = twoDecimals(maxs)
            mn = twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds = "c(0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99)"
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                execExpr = "quantile(%s[,1], c(%s));" % (hex_key, threshold)
                (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                h2p.green_print("\nresultExec: %s" % h2o.dump_json(resultExec))
                ex = twoDecimals(result)
                h2p.blue_print("\nthreshold: %.2f Exec quantile: %s Summary2: %s" % (threshold, ex, pt[i]))
                h2o_util.assertApproxEqual(result, pctile[i], tol=maxDelta, msg='percentile: %s is not expected: %s' % (result, pctile[i]))

            if DO_TRY_SCIPY:
                generate_scipy_comparison(csvPathnameFull)
Example #27
0
def generate_scipy_comparison(csvPathname, col=0, h2oMedian=None, h2oMedian2=None):
    # this is some hack code for reading the csv and doing some percentile stuff in scipy
    # from numpy import loadtxt, genfromtxt, savetxt
    import numpy as np
    import scipy as sp

    dataset = np.genfromtxt(
        open(csvPathname, 'r'),
        delimiter=',',
        # skip_header=1,
        dtype=None); # guess!

    print "csv read for training, done"
    # we're going to strip just the last column for percentile work
    # used below
    NUMCLASSES = 10
    print "csv read for training, done"

    # data is last column
    # drop the output
    print dataset.shape
    if len(dataset.shape) > 1:
        target = [x[col] for x in dataset]
    else:
        target = dataset

    # we may have read it in as a string. coerce to number
    targetFP = np.array(target, np.float)

    if 1==0:
        n_features = len(dataset[0]) - 1;
        print "n_features:", n_features

        # get the end
        # target = [x[-1] for x in dataset]
        # get the 2nd col

        print "histogram of target"
        print target
        print sp.histogram(target, bins=NUMCLASSES)

        print target[0]
        print target[1]

    thresholds   = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
    print "scipy per:", thresholds
    from scipy import stats
    # a = stats.scoreatpercentile(target, per=per)
    a = stats.mstats.mquantiles(targetFP, prob=thresholds)
    a2 = ["%.2f" % v for v in a]
    h2p.red_print("scipy stats.mstats.mquantiles:", a2)

    # also get the median with a painful sort (h2o_summ.percentileOnSortedlist()
    # inplace sort
    targetFP.sort()
    b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear')
    label = '50%' if DO_MEDIAN else '99.9%'
    h2p.blue_print(label, "from sort:", b)
    s = a[5 if DO_MEDIAN else 10]
    h2p.blue_print(label, "from scipy:", s)
    h2p.blue_print(label, "from h2o summary2:", h2oMedian)
    h2p.blue_print(label, "from h2o quantile multipass:", h2oMedian2)

    # see if scipy changes after the sort. nope, it doesn't
    if 1==0:
        a = stats.mstats.mquantiles(targetFP, prob=thresholds)
        a2 = ["%.2f" % v for v in a]
        h2p.red_print("after sort")
        h2p.red_print("scipy stats.mstats.mquantiles:", a2)
Example #28
0
def import_only(
    node=None,
    schema="local",
    bucket=None,
    path=None,
    timeoutSecs=30,
    retryDelaySecs=0.1,
    initialDelaySecs=0,
    pollTimeoutSecs=180,
    noise=None,
    benchmarkLogging=None,
    noPoll=False,
    doSummary=True,
    src_key=None,
    noPrint=False,
    importParentDir=True,
    **kwargs
):

    if src_key and schema != "put":
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node:
        node = h2o.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    h2o.verboseprint("head:", head)
    h2o.verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" % (path, path))

    if schema == "put":
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        h2o.verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o.python_test_name, "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        importResult = None

        return (None, key)

    if schema == "local" and not (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        folderURI = "nfs:/" + folderPath
        if importParentDir:
            importResult = node.import_files(folderPath, timeoutSecs=timeoutSecs)
        else:
            importResult = node.import_files(folderPath + "/" + pattern, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip("/")

        # strip leading / in head if present
        if bucket and head != "":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        print "\nimport_only:", h2o.python_test_name, schema, "uses", schema + "://" + folderOffset + "/" + pattern
        if h2o.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o.nodes[0]
        if schema == "s3" or node.redirect_import_folder_to_s3_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
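            # e.g. folderOffset 'smalldata/quantiles' becomes 'h2o-smalldata/quantiles';
            # offsets that don't contain 'smalldata' pass through unchanged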
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == "s3n" or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (
                    n.use_hdfs,
                    n.hdfs_version,
                    n.hdfs_name_node,
                )
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == "maprfs":
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        elif schema == "hdfs":
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (
                    n.use_hdfs,
                    n.hdfs_version,
                    n.hdfs_name_node,
                )
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                importResult = node.import_files(folderURI, timeoutSecs=timeoutSecs)
            else:
                importResult = node.import_files(folderURI + "/" + pattern, timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
    def test_exec2_quant_cmp_uniform(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            (500000, 1, 'x.hex', 1, 20000, ('C1', 1.10, 5000.0, 10000.0,
                                            15000.0, 20000.00)),
            (500000, 1, 'x.hex', -5000, 0, ('C1', -5001.00, -3750.0, -2445,
                                            -1200.0, 99)),
            (100000, 1, 'x.hex', -100000, 100000, ('C1', -100001.0, -50000.0,
                                                   1613.0, 50000.0, 100000.0)),
            (100000, 1, 'x.hex', -1, 1, ('C1', -1.05, -0.48, 0.0087, 0.50,
                                         1.00)),
            (100000, 1, 'A.hex', 1, 100, ('C1', 1.05, 26.00, 51.00, 76.00,
                                          100.0)),
            (100000, 1, 'A.hex', -99, 99, ('C1', -99, -50.0, 0, 50.00, 99)),
            (100000, 1, 'B.hex', 1, 10000, ('C1', 1.05, 2501.00, 5001.00,
                                            7501.00, 10000.00)),
            (100000, 1, 'B.hex', -100, 100, ('C1', -100.10, -50.0, 0.85, 51.7,
                                             100.00)),
            (100000, 1, 'C.hex', 1, 100000, ('C1', 1.05, 25002.00, 50002.00,
                                             75002.00, 100000.00)),
            (100000, 1, 'C.hex', -101, 101, ('C1', -100.10, -50.45, -1.18,
                                             49.28, 100.00)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, expectedMin, expectedMax,
             expected) in tryList:
            # max error = half the bin size?

            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1
            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, expectedMin,
                              expectedMax, SEEDPERFILE)
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            print "Parse result['destination_key']:", parseResult[
                'destination_key']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename
            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            # only one column
            column = summaryResult['summaries'][0]

            colname = column['colname']
            self.assertEqual(colname, expected[0])

            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']

            # FIX! we should compare mean and sd to expected?
            mean = stats['mean']
            sd = stats['sd']
            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(
                mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(
                sd)

            zeros = stats['zeros']
            mins = stats['mins']
            h2o_util.assertApproxEqual(mins[0],
                                       expected[1],
                                       tol=maxDelta,
                                       msg='min is not approx. expected')
            maxs = stats['maxs']
            h2o_util.assertApproxEqual(maxs[0],
                                       expected[5],
                                       tol=maxDelta,
                                       msg='max is not approx. expected')

            pct = stats['pct']
            # the thresholds h2o used, should match what we expected
            expectedPct = [
                0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999
            ]
            pctile = stats['pctile']
            h2o_util.assertApproxEqual(
                pctile[3],
                expected[2],
                tol=maxDelta,
                msg='25th percentile is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[5],
                expected[3],
                tol=maxDelta,
                msg='50th percentile (median) is not approx. expected')
            h2o_util.assertApproxEqual(
                pctile[7],
                expected[4],
                tol=maxDelta,
                msg='75th percentile is not approx. expected')

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            print "pct:", pct
            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)
                # apparently we're not able to estimate for these datasets
                # self.assertAlmostEqual(b, rowCount/len(hcnt), delta=.01*rowCount,
                #     msg="Bins not right. b: %s e: %s" % (b, e))

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            print "min/25/50/75/max colname:", colname, "(2 places):", compareActual
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2p.blue_print("\nTrying exec quantile")
            # thresholds is assumed to be defined at module scope in the original test;
            # defining it here (from the commented-out exec string) so the loop below is runnable
            thresholds = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]
            # do the equivalent exec quantile?
            # execExpr = "quantile(%s[,1],%s);" % (hex_key, thresholds)

            print "Comparing (two places) each of the summary2 threshold quantile results, to single exec quantile"
            for i, threshold in enumerate(thresholds):
                # FIX! do two of the same?..use same one for the 2nd
                if i != 0:
                    # execExpr = "r2=c(1); r2=quantile(%s[,4],c(0,.05,0.3,0.55,0.7,0.95,0.99))" % hex_key
                    execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s,%s));" % (
                        hex_key, threshold, threshold)
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    h2p.green_print("\nresultExec: %s" %
                                    h2o.dump_json(resultExec))
                    h2p.blue_print(
                        "\nthreshold: %.2f Exec quantile: %s Summary2: %s" %
                        (threshold, result, pt[i]))
                    if not result:
                        raise Exception(
                            "exec result: %s for quantile: %s is bad" %
                            (result, threshold))
                    h2o_util.assertApproxEqual(
                        result,
                        pctile[i],
                        tol=maxDelta,
                        msg=
                        'exec percentile: %s too different from expected: %s' %
                        (result, pctile[i]))
                # for now, do one with all, but no checking
                else:
                    # This seemed to "work" but how do I get the key name for the list of values returned
                    # the browser result field seemed right, but nulls in the key
                    if 1 == 0:
                        execExpr = "r2=c(1); r2=quantile(%s[,1], c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    else:
                        # does this way work? (column getting)
                        execExpr = "r2=c(1); r2=quantile(%s$C1, c(%s));" % (
                            hex_key, ",".join(map(str, thresholds)))
                    (resultExec, result) = h2e.exec_expr(execExpr=execExpr,
                                                         timeoutSecs=30)
                    inspect = h2o_cmd.runInspect(key='r2')
                    numCols = inspect['numCols']
                    numRows = inspect['numRows']

                    self.assertEqual(numCols, 1)
                    self.assertEqual(numRows, len(thresholds))
                    # FIX! should run thru the values in the col? how to get

            # compare the last one
            if colname != '':
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedList())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=0,  # what col to extract from the csv
                    datatype='float',
                    quantile=thresholds[-1],
                    # h2oSummary2=pctile[-1],
                    # h2oQuantilesApprox=result, # from exec
                    h2oExecQuantiles=result,
                )

            h2o.nodes[0].remove_all_keys()
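
The test above relies on a write_syn_dataset() helper that isn't part of this snippet. A minimal sketch of what such a generator presumably looks like for this test (uniform reals between expectedMin and expectedMax; the name, signature, and body are assumptions, not the project's actual helper):

import random

def write_syn_dataset(csvPathname, rowCount, colCount, expectedMin, expectedMax, SEED):
    # seed per file so a failing dataset can be regenerated
    r = random.Random(SEED)
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        # one uniform float per column, comma separated
        rowData = [str(r.uniform(expectedMin, expectedMax)) for j in range(colCount)]
        dsf.write(",".join(rowData) + "\n")
    dsf.close()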
Example #30
0
# this is type 7
alphap = 1
betap = 1

from scipy import stats
a1 = stats.scoreatpercentile(target,
                             per=100 * OTHER_T,
                             interpolation_method='fraction')
h2p.red_print("stats.scoreatpercentile:", a1)
a2 = stats.mstats.mquantiles(targetFP,
                             prob=[OTHER_T],
                             alphap=alphap,
                             betap=betap)
h2p.red_print("scipy stats.mstats.mquantiles:", a2)
targetFP.sort()
b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear')
h2p.red_print("sort algo:", b)
h2p.red_print("from h2o (multi):", quantiles[0])

print "Now looking at the sorted list..same thing"
h2p.blue_print("stats.scoreatpercentile:", a1)
a2 = stats.mstats.mquantiles(targetFP,
                             prob=[OTHER_T],
                             alphap=alphap,
                             betap=betap)
h2p.blue_print("scipy stats.mstats.mquantiles:", a2)
b = h2o_summ.percentileOnSortedList(targetFP, OTHER_T, interpolate='linear')
h2p.blue_print("sort algo:", b)
h2p.blue_print("from h2o (multi):", quantiles[0])
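
For hand-checking these comparisons, the alphap/betap plotting positions are what select the R-style quantile type in scipy. A small illustrative sketch (the sample values are made up, not from the test data):

from scipy import stats
import numpy as np

sample = [1.0, 2.0, 3.0, 4.0, 100.0]
# alphap=1, betap=1 is R type 7 (linear interpolation), same as numpy's default
print stats.mstats.mquantiles(sample, prob=[0.75], alphap=1, betap=1)
print np.percentile(sample, 75)
# alphap=1/3.0, betap=1/3.0 is R type 8 (approximately median-unbiased)
print stats.mstats.mquantiles(sample, prob=[0.75], alphap=1/3.0, betap=1/3.0)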
Example #31
0
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float', 
    h2oSummary2=None, 
    h2oSummary2MaxErr=None,
    h2oQuantilesApprox=None, h2oQuantilesExact=None, 
    h2oExecQuantiles=None,
    interpolate='linear', quantile=0.50, use_genfromtxt=False):
    try:
        import scipy as sp
        import numpy as np
        print "Both numpy and scipy are installed. Will do extra checks"
        SCIPY_INSTALLED = True
    except ImportError:
        print "numpy or scipy is not installed. Will only do sort-based checking"
        SCIPY_INSTALLED = False

    if use_genfromtxt and SCIPY_INSTALLED:
        print "Using numpy.genfromtxt. Better handling of null bytes"
        target = np.genfromtxt(
            open(csvPathname, 'r'),
            delimiter=',',
            skip_header=1 if skipHeader else 0,
            dtype=None)  # guess!
        # print "shape:", target.shape()

    else:
        print "Using python csv reader"
        target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype,
            skipHeader=skipHeader, preview=5)

    if datatype=='float':
        # to make irene's R runif files' first col work (quoted row numbers, integers);
        # shouldn't hurt anyone else
        # strip " from left and right (ignore leading/trailing whitespace)
        targetFP = map(float, target)
        # targetFP= np.array(tFP, np.float)
    if datatype=='int':
        targetFP = map(int, target)

    if SCIPY_INSTALLED:
        # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html
        # numpy.percentile has simple linear interpolate and midpoint
        # need numpy 1.9 for interpolation. numpy 1.8 doesn't have
        # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint')
        # 1.8
        p = np.percentile(targetFP, quantile*100)
        h2p.red_print("numpy.percentile", p)

        # per = [100 * t for t in thresholds]
        from scipy import stats
        s1 = stats.scoreatpercentile(targetFP, quantile*100)
        h2p.red_print("scipy stats.scoreatpercentile", s1)

        # scipy apparently doesn't have the use of means (type 2)
        # http://en.wikipedia.org/wiki/Quantile
        # it has median (R-8) with 1/3, 1/3

        if 1==0:
            # type 6
            alphap=0
            betap=0

            # type 5 okay but not perfect
            alphap=0.5
            betap=0.5

            # type 8
            alphap=1/3.0
            betap=1/3.0

        if interpolate=='mean':
            # an approx? (was good when comparing to h2o type 2)
            alphap=0.4
            betap=0.4

        if interpolate=='linear':
            # this is type 7
            alphap=1
            betap=1

        s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap)
        s2 = s2List[0]
        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html
        # type 7 
        # alphap=0.4, betap=0.4, 
        # type 2 not available? (mean)
        # alphap=1/3.0, betap=1/3.0 is approx median?
        h2p.red_print("scipy stats.mstats.mquantiles:", s2)

    # also get the median with a painful sort (h2o_summ.percentileOnSortedList())
    # inplace sort
    targetFP.sort()

    # this matches scipy type 7 (linear)
    # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear')
    # this matches h2o type 2 (mean)
    # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean')

    b = percentileOnSortedList(targetFP, quantile, interpolate='linear')
    label = str(quantile * 100) + '%'
    h2p.blue_print(label, "from sort:", b)

    if SCIPY_INSTALLED:
        h2p.blue_print(label, "from numpy:", p)
        h2p.blue_print(label, "from scipy 1:", s1)
        h2p.blue_print(label, "from scipy 2:", s2)

    h2p.blue_print(label, "from h2o summary:", h2oSummary2)
    h2p.blue_print(label, "from h2o multipass:", h2oQuantilesExact)
    h2p.blue_print(label, "from h2o singlepass:", h2oQuantilesApprox)
    h2p.blue_print(label, "from h2o exec:", h2oExecQuantiles)

    # they should be identical. keep a tight absolute tolerance
    # Note the comparisons have different tolerances, some are relative, some are absolute
    if h2oQuantilesExact:
        if math.isnan(float(h2oQuantilesExact)):
            raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact)
        h2o_util.assertApproxEqual(h2oQuantilesExact, b, tol=0.0000002, 
            msg='h2o quantile multipass is not approx. same as sort algo')

    if h2oQuantilesApprox:
        # this can be NaN if we didn't calculate it. turn the NaN string into a float NaN
        if math.isnan(float(h2oQuantilesApprox)):
            raise Exception("h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox)
        if h2oSummary2MaxErr:
            h2o_util.assertApproxEqual(h2oQuantilesApprox, b, tol=h2oSummary2MaxErr,
                msg='h2o quantile singlepass is not approx. same as sort algo')
        else:
            h2o_util.assertApproxEqual(h2oQuantilesApprox, b, rel=0.1,
                msg='h2o quantile singlepass is not approx. same as sort algo')

    if h2oSummary2:
        if math.isnan(float(h2oSummary2)):
            raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2)
        if h2oSummary2MaxErr:
            # maxErr absolute was calculated in the test from 0.5*(max-min)/(max_qbins-2)
            h2o_util.assertApproxEqual(h2oSummary2, b, tol=h2oSummary2MaxErr,
                msg='h2o summary2 is not approx. same as sort algo (calculated expected max error)')
        else:
            # bounds are way off, since it depends on the min/max of the col, not the expected value
            h2o_util.assertApproxEqual(h2oSummary2, b, rel=1.0,
                msg='h2o summary2 is not approx. same as sort algo (sloppy compare)')

    if h2oQuantilesApprox and h2oSummary2:
        # they should both get the same answer. Currently they have different code, but same algo
        # FIX! ...changing to a relative tolerance, since we're getting a miscompare in some cases.
        # not sure why..maybe some subtle algo diff.
        h2o_util.assertApproxEqual(h2oSummary2, h2oQuantilesApprox, rel=0.04,
            msg='h2o summary2 is not approx. same as h2o singlepass.'+\
                ' Check that max_qbins is 1000 (summary2 is fixed) and type 7 interpolation')

    if h2oExecQuantiles:
        if math.isnan(float(h2oExecQuantiles)):
            raise Exception("h2oExecQuantiles is unexpectedly NaN %s" % h2oExecQuantiles)
        # bounds are way off
        h2o_util.assertApproxEqual(h2oExecQuantiles, b, rel=1.0,
            msg='h2o summary2 is not approx. same as sort algo')

    if SCIPY_INSTALLED:
        if h2oQuantilesExact:
            h2o_util.assertApproxEqual(h2oQuantilesExact, p, tol=0.0000002,
                msg='h2o quantile multipass is not same as numpy.percentile')
            h2o_util.assertApproxEqual(h2oQuantilesExact, s1, tol=0.0000002,
                msg='h2o quantile multipass is not same as scipy stats.scoreatpercentile')

        # give us some slack compared to the scipy use of median (instead of desired mean)
        # since we don't have bounds here like above, just stop this test for now
        if h2oQuantilesApprox and 1==0:
            if interpolate=='mean':
                h2o_util.assertApproxEqual(h2oQuantilesApprox, s2, rel=0.5,
                    msg='h2o quantile singlepass is not approx. same as scipy stats.mstats.mquantiles')
            else:
                h2o_util.assertApproxEqual(h2oQuantilesApprox, s2, rel=0.5,
                    msg='h2o quantile singlepass is not same as scipy stats.mstats.mquantiles')

        # see if scipy changes. nope. it doesn't 
        if 1==0:
            a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap)
            h2p.red_print("after sort")
            h2p.red_print("scipy stats.mstats.mquantiles:", a)
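
percentileOnSortedList() is only referenced above; a minimal sketch of the type-7 (linear) interpolation it presumably does on an already-sorted list, with the name and signature assumed from how it's called:

import math

def percentileOnSortedList(sortedList, quantile, interpolate='linear'):
    # zero-based target index, split into integer and fractional parts
    targetFull = quantile * (len(sortedList) - 1)
    lo = int(math.floor(targetFull))
    frac = targetFull - lo
    if lo + 1 >= len(sortedList) or frac == 0:
        return sortedList[lo]
    if interpolate == 'mean':
        # type 2: simple mean of the two bounding values
        return (sortedList[lo] + sortedList[lo + 1]) / 2.0
    # type 7: linear interpolation between the two bounding values
    return sortedList[lo] + frac * (sortedList[lo + 1] - sortedList[lo])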
Example #32
0
def findQuantile(d, dmin, dmax, threshold):
    # return the value at the threshold, or the mean of the two rows that bound it.
    # fixed bin count per pass. Stops at maxIterations if not resolved to one true answer
    maxIterations = 30

    # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere)
    totalRows = len(d)
    # Used to have
    desiredBinCnt = BIN_COUNT
    maxBinCnt = desiredBinCnt + 1  # might go one over due to FP issues

    # initialize
    newValStart = dmin
    newValEnd = dmax
    newValRange = newValEnd - newValStart
    desiredBinCnt = BIN_COUNT  # Could do per-pass adjustment, but fixed works fine.
    newValBinSize = newValRange / (desiredBinCnt + 0.0)
    newLowCount = 0  # count of rows below the bins
    # yes there is no newHighCount. Created during the pass, though.

    # state shared by each pass
    assert maxBinCnt > 0

    hcnt2 = [None for b in range(maxBinCnt)]
    hcnt2_min = [None for b in range(maxBinCnt)]
    hcnt2_max = [None for b in range(maxBinCnt)]
    hcnt2_low = 0
    hcnt2_high = 0

    assert newValBinSize != 0  # can be negative
    assert newValEnd > newValStart
    assert newValRange > 0

    # break out on stopping condition
    # reuse the histogram array hcnt2[]
    iteration = 0
    done = False
    # append to a list of best guesses per pass
    best_result = []

    def htot2():
        return sum(hcnt2) + hcnt2_low + hcnt2_high

    while iteration <= maxIterations and not done:
        h2p.green_print("newValStart", newValStart)
        h2p.green_print("newValEnd", newValEnd)
        h2p.green_print("newValRange", newValRange)
        h2p.green_print("newValBinSize", newValBinSize)
        h2p.green_print("newLowCount", newLowCount)
        h2p.green_print("threshold", threshold)

        valStart = newValStart
        valEnd = newValEnd
        valRange = newValRange
        valBinSize = newValBinSize
        lowCount = newLowCount
        desiredBinCnt = BIN_COUNT
        maxBinCnt = desiredBinCnt + 1  # might go one over due to FP issues

        # playing with creating relative NUDGE values to make sure bin range
        # is always inclusive of target.
        # ratio it down from valBinSize.
        # It doesn't need to be as big as valBinSize.
        # implicitly, it shouldn't need to be as large as valBinSize
        # can't seem to make it work yet. leave NUDGE=0
        NUDGE = 0

        # init to zero for each pass
        for b in range(maxBinCnt):
            hcnt2[b] = 0.0

        # Init counts outside of the bins
        hcnt2_low = 0
        hcnt2_high = 0

        # minimum value for higher than the bin. Needed for interpolation
        hcnt2_high_min = None

        for val in d:
            # Need to count the stuff outside the bin-gathering,
            # since threshold compare is based on total row compare
            # on first pass, shouldn't see anything exceed the start/end bounds
            # since those are min/max for the column? (shouldn't be any fp precision issue? or ??)
            # oh wait, this valOffset math creates possible precision issue?
            # maybe we should address it with the NUDGE value below? but what about first pass?
            valOffset = val - valStart
            # where are we zeroing in? (start)
            binIdx2 = int(math.floor(
                valOffset /
                (valBinSize + 0.0)))  # make sure it's always an fp divide?

            # do some close looking for possible fp arith issues
            cA = valOffset < 0
            cB = binIdx2 < 0
            t = {True: 1, False: 0}
            # we get the 10 case
            if ((cA and not cB) or (not cA and cB)):
                h2p.red_print("AB Interesting lower bin edge case %s%s" % (t[cA], t[cB]), "cA", cA, "cB", cB, "valOffset", valOffset, \
                    "binIdx2", binIdx2)
            cC = val > valEnd
            cD = binIdx2 >= (maxBinCnt - 1)  # tighten the compare for printing
            if ((cC and not cD) or (not cC and cD)):
                h2p.red_print("CD Interesting upper bin edge case %s%s" % (t[cC], t[cD]), "cC", cC, "cD", cD, "val", val, "valEnd", valEnd, \
                    "binIdx2", binIdx2, "maxBinCnt", maxBinCnt)
                # example hits this case..i.e. the max value
                # CD Interesting upper bin edge case 01 cC False cB True val 100.995097486 valEnd 100.995097486 binIdx2 2 maxBinCnt 3

            if valOffset < 0 or binIdx2 < 0:
                # if valOffset < 0:
                # if binIdx2<0:
                hcnt2_low += 1
            # prevent the extra bin from being used..i.e. eliminate the fuzziness for sure!
            # have to use both compares, since can wrap the index (due to start/end shift)
            # elif val > valEnd or binIdx2>=(maxBinCnt-1):
            # should this really be a valOffset compare?
            elif val > valEnd or binIdx2 >= maxBinCnt:
                # elif val > valEnd:
                # elif binIdx2>=(maxBinCnt-1):
                if (hcnt2_high == 0) or (val < hcnt2_high_min):
                    hcnt2_high_min = val
                    print "hcnt2_high_min update:", hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd
                hcnt2_high += 1
            else:
                # print "(multi) val: ",val," valOffset: ",valOffset," valBinSize: ",valBinSize

                assert binIdx2 >=0 and binIdx2<=(maxBinCnt-1), "val %s %s %s %s binIdx2: %s maxBinCnt: %s valBinSize: %s" % \
                    (val, valStart, valEnd, valOffset, binIdx2, maxBinCnt, valBinSize)
                if hcnt2[binIdx2] == 0 or (val < hcnt2_min[binIdx2]):
                    hcnt2_min[binIdx2] = val
                if hcnt2[binIdx2] == 0 or (val > hcnt2_max[binIdx2]):
                    hcnt2_max[binIdx2] = val
                hcnt2[binIdx2] += 1

                # check if we went into the magic extra bin
                if binIdx2 == (maxBinCnt - 1):
                    print "\nFP! val went into the extra maxBinCnt bin:", \
                    binIdx2, hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd,"\n"

            # check the legal states for these two
            # we don't have None for checking hcnt2_high_min in java
            assert hcnt2_high == 0 or (hcnt2_high_min is not None)
            assert (hcnt2_high_min is None) or hcnt2_high != 0

        # everything should either be in low, the bins, or high
        totalBinnedRows = htot2()
        print "totalRows check: %s htot2(): %s should be equal. hcnt2_low: %s hcnt2_high: %s" % \
            (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high)

        assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal. hcnt2_low: %s hcnt2_high: %s" % \
            (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high)

        # now walk thru and find out what bin to look inside
        currentCnt = hcnt2_low
        targetCntFull = threshold * (totalRows - 1)  # zero based indexing
        targetCntInt = int(math.floor(threshold * (totalRows - 1)))
        targetCntFract = targetCntFull - targetCntInt
        assert targetCntFract >= 0 and targetCntFract <= 1
        print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract

        k = 0
        while ((currentCnt + hcnt2[k]) <= targetCntInt):
            # print "looping for k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k]
            currentCnt += hcnt2[k]
            # ugly but have to break out if we'd cycle along with == adding h0's until we go too far
            # are we supposed to advance to a non-zero bin?
            k += 1  # goes over in the equal case?
            # if currentCnt >= targetCntInt:
            #     break
            if k == maxBinCnt:
                break
            assert k < maxBinCnt, "k too large, k: %s maxBinCnt %s %s %s %s" % (
                k, maxBinCnt, currentCnt, targetCntInt, hcnt2[k - 1])

        # format string to match java Log.info() in Quantiles.java
        print "Found k (multi): ", k, " ", currentCnt, " ", targetCntInt, " ", totalRows, " ", hcnt2[
            k], " ", hcnt2_min[k], " ", hcnt2_max[k]
        assert hcnt2[k] != 1 or hcnt2_min[k] == hcnt2_max[k]

        # make a possibly interpolated guess first, in case we have to iterate (best guess so far)
        done = False
        guess = (hcnt2_max[k] - hcnt2_min[k]) / 2

        # we might not have gotten all the way
        if currentCnt == targetCntInt:
            if hcnt2[k] > 2 and (hcnt2_min[k] == hcnt2_max[k]):
                guess = hcnt2_min[k]
                print "Guess A", guess, k, hcnt2[k]

            if hcnt2[k] == 2:
                print "hello"
                print "\nTwo values in this bin but we could be aligned to the 2nd, so can't stop"
                # no matter what size the fraction is, it would land on this number
                guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0

                if INTERPOLATION_TYPE == 2:  # type 2 (mean)
                    guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0

                else:  # default to type 7 (linear interpolation)
                    # Unlike mean, which just depends on two adjacent values, this adjustment
                    # adds possible errors related to the arithmetic on the total # of rows.
                    dDiff = hcnt2_max[k] - hcnt2_min[
                        k]  # two adjacent..as if sorted!
                    pctDiff = targetCntFract  # This is the fraction of total rows
                    guess = hcnt2_min[k] + (pctDiff * dDiff)

                done = False
                print "Guess B", guess

            if hcnt2[k] == 1 and targetCntFract == 0:
                assert hcnt2_min[k] == hcnt2_max[k]
                guess = hcnt2_min[k]
                done = True
                print "k", k
                print "Guess C", guess

            if hcnt2[k] == 1 and targetCntFract != 0:
                assert hcnt2_min[k] == hcnt2_max[k]
                print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero"
                if k < maxBinCnt:
                    nextK = k + 1  # could put it over maxBinCnt
                else:
                    nextK = k
                while nextK < maxBinCnt and hcnt2[nextK] == 0:
                    nextK += 1

                # have the "extra bin" for this
                if nextK >= maxBinCnt:
                    assert hcnt2_high != 0
                    print "Using hcnt2_high_min for interpolate:", hcnt2_high_min
                    nextVal = hcnt2_high_min
                else:
                    print "Using nextK for interpolate:", nextK
                    assert hcnt2[nextK] != 0
                    nextVal = hcnt2_min[nextK]

                guess = (hcnt2_max[k] + nextVal) / 2.0
                # OH! fixed bin as opposed to sort. Of course there are gaps between k and nextK

                if INTERPOLATION_TYPE == 2:  # type 2 (mean)
                    guess = (hcnt2_max[k] + nextVal) / 2.0
                    pctDiff = 0.5
                else:  # default to type 7 (linear interpolation)
                    dDiff = nextVal - hcnt2_max[
                        k]  # two adjacent, as if sorted!
                    pctDiff = targetCntFract  # This is the fraction of total rows
                    guess = hcnt2_max[k] + (pctDiff * dDiff)

                done = True  # has to be one above us when needed. (or we're at end)

                print 'k', 'hcnt2_max[k]', 'nextVal'
                print "hello3:", k, hcnt2_max[k], nextVal
                print "\nInterpolating result using nextK: %s nextVal: %s" % (
                    nextK, nextVal)
                print "Guess D", guess

        if not done:
            print "%s %s %s %s Not done, setting new range" % (hcnt2[k], currentCnt, targetCntInt, targetCntFract),\
                "k: ", k,\
                "currentCnt: ", currentCnt,\
                "hcnt2_min[k]: ", hcnt2_min[k],\
                "hcnt2_max[k]: ", hcnt2_max[k]

            # possible bin leakage at start/end edges due to fp arith.
            # the bin index arith may resolve OVER the boundary created by the compare for hcnt2_high compare
            # rather than using NUDGE, see if there's a non-zero bin below (min) or above (max) you.
            # Just need to check the one bin below and above k, if they exist.
            if k > 0 and hcnt2[k - 1] > 0 and (hcnt2_max[k - 1] <
                                               hcnt2_min[k]):
                print "1"
                newValStart = hcnt2_max[k - 1]
            else:
                print "2"
                newValStart = hcnt2_min[k]

            # subtle. we do put stuff in the extra end bin (see the print above that happens)
            # k might be pointing to one less than that (like k=0 for 1 bin case)
            if k < (maxBinCnt - 1) and hcnt2[k + 1] > 0 and (hcnt2_min[k + 1] > hcnt2_max[k]):
                print "3"
                newValEnd = hcnt2_min[k + 1]
            else:
                print "4"
                newValEnd = hcnt2_max[k]

            newValRange = newValEnd - newValStart
            # maxBinCnt is always binCount + 1, since we might go one over due to rounding/fp issues
            newValBinSize = newValRange / (desiredBinCnt + 0.0)

            # the start/end should never change if we're just using one bin
            # this is a bin-leakage test if you use one bin (we should never resolve exactly; stop at max iterations)
            # assumes NUDGE is 0
            if NUDGE == 0.0:
                assert desiredBinCnt>1 or (valStart==newValStart and valEnd==newValEnd),\
                    "if 1 bin, should be no per-pass edge leakage %s %s %s %s %s %s" % (k, hcnt2_high, valStart, newValStart, valEnd, newValEnd)
            newLowCount = currentCnt
            if newValBinSize == 0:
                # assert done or newValBinSize!=0 and live with current guess
                print "Assuming done because newValBinSize is 0."
                print "newValRange: %s, hcnt2[k]: %s hcnt2_min[k]: %s hcnt2_max[k]: %s" %\
                     (newValRange, hcnt2[k], hcnt2_min[k], hcnt2_max[k])
                guess = newValStart
                print "Guess E", guess
                # was done = True 3/20/14
                done = True

            # if we have to interpolate
            # if it falls into this bin, interpolate to this bin means one answer?

            # cover the case above with multiple entries in a bin, all the same value
            # will be zero on the last pass?
            # assert newValBinSize != 0 or done
            # need the count up to but not including newValStart

        best_result.append(guess)
        iteration += 1

        h2p.blue_print("Ending Pass", iteration)
        h2p.blue_print("best_result:", best_result, "done:", done, "hcnt2[k]",
                       hcnt2[k])
        print "currentCnt", currentCnt, "targetCntInt", targetCntInt, "hcnt2_low", hcnt2_low, "hcnt2_high", hcnt2_high
        print "was", valStart, valEnd, valRange, valBinSize
        print "next", newValStart, newValEnd, newValRange, newValBinSize

    return best_result[-1]
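
A quick way to exercise findQuantile() by hand, comparing its final best guess to numpy. BIN_COUNT and INTERPOLATION_TYPE are module-level settings in the original file; the values below are assumptions for this sketch:

import random
import numpy as np

BIN_COUNT = 50            # assumed value; the real test picks its own bin count
INTERPOLATION_TYPE = 7    # linear interpolation, to match numpy's default

random.seed(0)
d = [random.uniform(-100, 100) for _ in range(10000)]
print "findQuantile:", findQuantile(d, min(d), max(d), 0.5)
print "numpy       :", np.percentile(d, 50)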
Example #33
0
    def test_summary2_small(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # colname, (min, 25th, 50th, 75th, max)
            # if rowCount is None, we'll just use  the data values
            # None in expected values means no compare
            (None, 1, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 2, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 10, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 100, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            (None, 1000, "x.hex", [-1, 0, 1], ("C1", None, None, 0, None, None)),
            # (None, 10000, 'x.hex', [-1,0,1],        ('C1',  None, None, 0, None, None)),
            # (COLS, 1, 'x.hex', [1,0,-1],        ('C1',  None, None, None, None, None)),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, values, expected) in tryList:
            # max error = half the bin size?

            expectedMax = max(values)
            expectedMin = min(values)
            maxDelta = ((expectedMax - expectedMin) / 20.0) / 2.0
            # add 5% for fp errors?
            maxDelta = 1.05 * maxDelta

            # hmm...say we should be 100% accurate for these tests?
            maxDelta = 0

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            if not rowCount:
                rowFile = len(values)
            else:
                rowFile = rowCount
            csvFilename = "syn_" + "binary" + "_" + str(rowFile) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, values, SEEDPERFILE)

            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=30, doSummary=False
            )
            print "Parse result['destination_key']:", parseResult["destination_key"]

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvFilename

            numRows = inspect["numRows"]
            numCols = inspect["numCols"]

            summaryResult = h2o_cmd.runSummary(key=hex_key, max_qbins=MAX_QBINS, timeoutSecs=45)
            h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

            quantile = 0.5 if DO_MEDIAN else 0.999
            q = h2o.nodes[0].quantiles(
                source_key=hex_key,
                column=0,
                interpolation_type=7,
                quantile=quantile,
                max_qbins=MAX_QBINS,
                multiple_pass=2,
            )
            qresult = q["result"]
            qresult_single = q["result_single"]
            qresult_iterations = q["iterations"]
            qresult_interpolated = q["interpolated"]
            h2p.blue_print("h2o quantiles result:", qresult)
            h2p.blue_print("h2o quantiles result_single:", qresult_single)
            h2p.blue_print("h2o quantiles iterations:", qresult_iterations)
            h2p.blue_print("h2o quantiles interpolated:", qresult_interpolated)
            print h2o.dump_json(q)

            self.assertLess(
                qresult_iterations,
                16,
                msg="h2o does max of 16 iterations. likely no result_single if we hit max. is bins=1?",
            )

            # only one column
            column = summaryResult["summaries"][0]

            colname = column["colname"]

            coltype = column["type"]
            nacnt = column["nacnt"]

            stats = column["stats"]
            stattype = stats["type"]

            # FIX! we should compare mean and sd to expected?
            mean = stats["mean"]
            sd = stats["sd"]

            print "colname:", colname, "mean (2 places):", h2o_util.twoDecimals(mean)
            print "colname:", colname, "std dev. (2 places):", h2o_util.twoDecimals(sd)

            zeros = stats["zeros"]
            mins = stats["mins"]
            maxs = stats["maxs"]
            pct = stats["pct"]
            # the thresholds h2o used, should match what we expected
            expectedPct = [0.01, 0.05, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.95, 0.99]

            pctile = stats["pctile"]
            print "pctile:", pctile
            if expected[0]:
                self.assertEqual(colname, expected[0])
            if expected[1]:
                h2o_util.assertApproxEqual(mins[0], expected[1], tol=maxDelta, msg="min is not approx. expected")
            if expected[2]:
                h2o_util.assertApproxEqual(
                    pctile[3], expected[2], tol=maxDelta, msg="25th percentile is not approx. expected"
                )
            if expected[3]:
                h2o_util.assertApproxEqual(
                    pctile[5], expected[3], tol=maxDelta, msg="50th percentile (median) is not approx. expected"
                )
            if expected[4]:
                h2o_util.assertApproxEqual(
                    pctile[7], expected[4], tol=maxDelta, msg="75th percentile is not approx. expected"
                )
            if expected[5]:
                h2o_util.assertApproxEqual(maxs[0], expected[5], tol=maxDelta, msg="max is not approx. expected")

            hstart = column["hstart"]
            hstep = column["hstep"]
            hbrk = column["hbrk"]
            hcnt = column["hcnt"]

            print "pct:", pct
            print ""

            print "hcnt:", hcnt
            print "len(hcnt)", len(hcnt)

            # don't check the last bin
            for b in hcnt[1:-1]:
                # should we be able to check for a uniform distribution in the files?
                e = numRows / len(hcnt)  # expect 21 thresholds, so 20 bins. each 5% of rows (uniform distribution)
                # don't check the edge bins
                self.assertAlmostEqual(
                    b, numRows / len(hcnt), delta=1 + 0.01 * numRows, msg="Bins not right. b: %s e: %s" % (b, e)
                )

            pt = h2o_util.twoDecimals(pctile)
            mx = h2o_util.twoDecimals(maxs)
            mn = h2o_util.twoDecimals(mins)
            print "colname:", colname, "pctile (2 places):", pt
            print "colname:", colname, "maxs: (2 places):", mx
            print "colname:", colname, "mins: (2 places):", mn

            # FIX! we should do an exec and compare using the exec quantile too
            compareActual = mn[0], pt[3], pt[5], pt[7], mx[0]
            h2p.green_print("min/25/50/75/max colname:", colname, "(2 places):", compareActual)
            print "maxs colname:", colname, "(2 places):", mx
            print "mins colname:", colname, "(2 places):", mn

            trial += 1
            h2o.nodes[0].remove_all_keys()

            scipyCol = 0

            # don't check if colname is empty..means it's a string and scipy doesn't parse right?
            if colname != "":
                # don't do for enums
                # also get the median with a sort (h2o_summ.percentileOnSortedList())
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    col=scipyCol,  # what col to extract from the csv
                    datatype="float",
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                )
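
The maxDelta / h2oSummary2MaxErr tolerances used by these tests come from the bin width of the single-pass summary. A small worked sketch of that arithmetic (MAX_QBINS is a module-level constant in the original test; the value here is an assumption):

MAX_QBINS = 1000
expectedMin, expectedMax = -1, 1
# per the "max error = half the bin size?" comments above
maxErr = 0.5 * (expectedMax - expectedMin) / (MAX_QBINS - 2)
print "summary2 max error bound:", maxErr   # ~0.001 for this range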
Example #34
0
def import_only(node=None,
                schema='local',
                bucket=None,
                path=None,
                timeoutSecs=30,
                retryDelaySecs=0.1,
                initialDelaySecs=0,
                pollTimeoutSecs=180,
                noise=None,
                benchmarkLogging=None,
                noPoll=False,
                doSummary=True,
                src_key=None,
                noPrint=False,
                importParentDir=True,
                **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.

    # if schema=='put':
    #    h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +
    #        "\nMeans multi-machine with 'put' will fail")
    #    schema = 'local'

    if src_key and schema != 'put':
        raise Exception(
            "can only specify a 'src_key' param for schema='put'. You have %s %s"
            % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o_nodes.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    verboseprint("head:", head)
    verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" %
                            (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" %
                            (head, path))

    if schema == 'put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception(
                "h2o putfile basename %s can't be regex. path= was %s" %
                (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        verboseprint("put filename:", filename, "folderPath:", folderPath,
                     "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o_args.python_test_name,
                            "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" %
                            filePath)
            h2p.blue_print("That path resolves as:",
                           os.path.realpath(filePath))

        if h2o_args.abort_after_import:
            raise Exception(
                "Aborting due to abort_after_import (-aai) argument's effect in import_only()"
            )

        # h2o-dev: it always wants a key name
        if src_key is None:
            src_key = filename
        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        # ..make it look like an import files result..This is just for test consistency
        importResult = json.loads('{\
          "dels": [],\
          "fails": [],\
          "files": ["%s"],\
          "keys": ["%s"],\
          "path": "%s",\
          "schema_name": null, "schema_type": null, "schema_version": null\
        }' % (filename, src_key, filePath))
        return (importResult, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o_args.python_test_name,
                        "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:",
                       os.path.realpath(filePath))
        if h2o_args.abort_after_import:
            raise Exception(
                "Aborting due to abort_after_import (-aai) argument's effect in import_only()"
            )

        # FIX! why are we returning importPattern here..it's different than finalImportString if we import a folder?
        # is it used for key matching by others?

        # FIX! hack ..h2o-dev is creating key names with the absolute path, not the sym link path
        # messes up for import folders that go thru /home/<user>/home-0xdiag-datasets
        # importPattern = folderURI + "/" + pattern
        # could include this on the entire importPattern if we no longer have regex basename in h2o-dev?

        folderURI = 'nfs:/' + folderPath
        # folderURI = 'nfs:/' + os.path.realpath(folderPath)
        if importParentDir:
            finalImportString = folderPath
        else:
            finalImportString = folderPath + "/" + pattern
        importResult = node.import_files(finalImportString,
                                         timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            verboseprint("You said bucket:", bucket,
                         "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')

        # strip leading / in head if present
        if bucket and head != "":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        if h2o_args.abort_after_import:
            raise Exception(
                "Aborting due to abort_after_import (-aai) argument's effect in import_only()"
            )

        n = h2o_nodes.nodes[0]
        if schema == 's3' or node.redirect_import_folder_to_s3_path:
            # this is just like s3n now? i.e. we can point down inside the s3 bucket like s3n?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString,
                                             timeoutSecs=timeoutSecs)

        elif schema == 's3n' or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and
                    ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (
                    n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"
            folderURI = "s3n://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString,
                                             timeoutSecs=timeoutSecs)

        elif schema == 'maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"
            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString,
                                             timeoutSecs=timeoutSecs)

        elif schema == 'hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and
                    ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (
                    n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString,
                                             timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    print "\nimport_only:", h2o_args.python_test_name, schema, "uses", finalImportString
    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
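
A minimal usage sketch for import_only(); the bucket and path values below are placeholders for illustration, not datasets this snippet is known to ship with:

if __name__ == '__main__':
    # upload one local file with schema='put' and give it an explicit key name
    importResult, key = import_only(bucket='smalldata', path='iris/iris2.csv',
                                    schema='put', src_key='iris2.csv', timeoutSecs=60)
    print "put key:", key

    # import a folder over hdfs; only the basename part of path= may be a pattern
    importResult, importPattern = import_only(bucket=None, path='datasets/parse_test/*.csv',
                                              schema='hdfs', timeoutSecs=120)
    print "import pattern:", importPattern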