Example #1
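The snippets collected below are individual test methods lifted from the H2O python test harness and are not runnable on their own. For orientation, here is a minimal, hypothetical scaffold of the kind of module they live in. The module names (h2o, h2o_cmd, h2o_import, h2o_util, h2o_exec, h2o_jobs) follow the harness conventions, but the exact cloud setup/teardown calls and their arguments are assumptions; module-level helpers and constants referenced later (write_syn_dataset, initList, exprList, ROWS, COLS, DO_EXPORT, DO_REBALANCE, REBALANCE_CHUNKS, ENABLE_ASSERTS) and expression builders such as AssignObj, Fcn, KeyIndexed and OutputObj are defined per test file and not shown here.

import unittest, time, random, getpass, sys

import h2o, h2o_cmd, h2o_util, h2o_jobs
import h2o_import as h2i
import h2o_exec as h2e

class Basic(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # assumed harness call: build (or attach to) a small local H2O cloud
        h2o.init(1, java_heap_GB=4)

    @classmethod
    def tearDownClass(cls):
        # assumed harness call: shut the cloud down after the tests finish
        h2o.tear_down_cloud()

    # the test_* methods shown in the examples below would be pasted in here

if __name__ == '__main__':
    h2o.unit_main()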
    def test_exec2_runif(self):
        print "in h2o-dev, params are column, min, max, seed"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            # params for h2o-dev runif are: column, min, max, seed
            AssignObj('r0.hex', KeyIndexed('r.hex', col=0)),
            AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0),
                                    1)),
            AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1),
                                    -1)),
            AssignObj('s2.hex',
                      Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1)),
        ]

        results = []
        for execExpr in execExprList:
            start = time.time()
            result = execExpr.do(timeoutSecs=30)
            results.append(result)
            execResult = execExpr.execResult
            print "exec took", time.time() - start, "seconds"
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(execResult)
            h2o.check_sandbox_for_errors()

        rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        if 1 == 0:
            print "Comparing summary of r.hex to summary of s.hex"
            df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
            # time can be different
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertLess(len(df.difference), 2)

            print "results from the individual exec expresssions (ignore last which was an apply)"
            print "results:", results
            self.assertEqual(results, [
                0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567,
                1859.0, 1859.0
            ])
    def test_exec2_runif(self):
        print "in h2o-dev, params are column, min, max, seed"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            # params for h2o-dev runif are: column, min, max, seed 
            AssignObj('r0.hex', KeyIndexed('r.hex',col=0) ),
            AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1) ),
            AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1)  ),
            AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1) ),
        ]

        results = []
        for execExpr in execExprList:
            start = time.time()
            result = execExpr.do(timeoutSecs=30)
            results.append(result)
            execResult = execExpr.execResult
            print "exec took", time.time() - start, "seconds"
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(execResult)
            h2o.check_sandbox_for_errors()

        rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        if 1==0:
            print "Comparing summary of r.hex to summary of s.hex"
            df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
            # time can be different
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertLess(len(df.difference), 2)
        

            print "results from the individual exec expresssions (ignore last which was an apply)"
            print "results:", results
            self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0])
    def test_exec2_reduction(self):
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        if getpass.getuser()=='jenkins':
            csvPathname = 'standard/billion_rows.csv.gz'
        else:
            csvPathname = '1B/reals_1B_15f.data'
            csvPathname = '1B/reals_100000x1000_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='local', 
            hex_key=hex_key, timeoutSecs=3000, retryDelaySecs=2)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

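        # initList and exprList are module-level lists of exec expressions in the
        # original test file; they are not shown in this snippet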
        for execExpr in initList:
            result = execExpr.do(timeoutSecs=30)

        for execExpr in exprList:
            start = time.time()
            result = execExpr.do(timeoutSecs=30)
            execResult = execExpr.execResult
            print "exec took", time.time() - start, "seconds"
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(execResult)
            h2o.check_sandbox_for_errors()
    def test_exec2_reduction(self):
        bucket = 'home-0xdiag-datasets'
        # csvPathname = 'airlines/year2013.csv'
        if getpass.getuser() == 'jenkins':
            csvPathname = 'standard/billion_rows.csv.gz'
        else:
            csvPathname = '1B/reals_1B_15f.data'
            csvPathname = '1B/reals_100000x1000_15f.data'

        hex_key = 'r1'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=3000,
                                       retryDelaySecs=2)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(
            inspect)

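        # as above, initList and exprList come from module scope in the original
        # test file and are not shown in this snippet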
        for execExpr in initList:
            result = execExpr.do(timeoutSecs=30)

        for execExpr in exprList:
            start = time.time()
            result = execExpr.do(timeoutSecs=30)
            execResult = execExpr.execResult
            print "exec took", time.time() - start, "seconds"
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(execResult)
            h2o.check_sandbox_for_errors()
    def test_parse_file_loop(self):
        lenNodes = len(h2o.nodes)
        trial = 0
        for i in range(2):
            for j in range(1,10):
                # spread the parse around the nodes. Note that keys are produced by H2O, so keys are not reused
                nodeX = random.randint(0,lenNodes-1) 
                parseResult= h2i.import_parse(node=h2o.nodes[nodeX],
                    bucket='smalldata', path='logreg/prostate.csv', schema='put')
                trial += 1

            # dump some cloud info so we can see keys?
            print "\nAt trial #" + str(trial)
            c = h2o.nodes[0].get_cloud()
            print (h2o.dump_json(c))
    def test_parse_file_loop(self):
        lenNodes = len(h2o.nodes)
        trial = 0
        for i in range(2):
            for j in range(1, 10):
                # spread the parse around the nodes. Note that keys are produced by H2O, so keys are not reused
                nodeX = random.randint(0, lenNodes - 1)
                parseResult = h2i.import_parse(node=h2o.nodes[nodeX],
                                               bucket='smalldata',
                                               path='logreg/prostate.csv',
                                               schema='put')
                trial += 1

            # dump some cloud info so we can see keys?
            print "\nAt trial #" + str(trial)
            c = h2o.nodes[0].get_cloud()
            print(h2o.dump_json(c))
    def test_a_simple3(self):
        a = h2o.n0.endpoints()
        print h2o.dump_json(a)
        print "There are %s endpoints" % len(a["routes"])
        for l in a["routes"]:
            print l["url_pattern"]
    def test_GBMGrid_basic_many(self):
        trainFilename = 'prostate.csv'
        train_key = 'prostate.hex'
        timeoutSecs = 300
        csvPathname = "logreg/" + trainFilename
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=train_key, schema='put')

        pA = h2o_cmd.ParseObj(parseResult)
        iA = h2o_cmd.InspectObj(pA.parse_key)
        parse_key = pA.parse_key
        numRows = iA.numRows
        numCols = iA.numCols
        labelList = iA.labelList

        labelListUsed = list(labelList)
        numColsUsed = numCols

        parameters = {
            'validation_frame': train_key,
            'ignored_columns': "['ID']", # this has to have []
            'response_column': 'CAPSULE',
            # 'balance_classes':
            # 'max_after_balance_size':
            # ??
            # 'ntrees': '[8, 10]',
            'ntrees': 8,
            # 'max_depth': '[8, 9]',
            'max_depth': 8,
            # ??
            # 'min_rows': '[1, 2]',
            'min_rows': 1,
            'nbins': 40,
            # ??
            # 'learn_rate': "[0.1, 0.2]",
            'learn_rate': 0.1,
            # FIX! doesn't like it?
            # 'loss': 'Bernoulli',
            # FIX..no variable importance for GBM yet?
            # 'variable_importance': False,
            # 'seed': 
        }

        jobs = []
        # kick off 5 of these GBM grid jobs, with different tree choices
        start = time.time()
        totalGBMGridJobs = 0

        for i in range(5):
            modelKey = 'GBMGrid_prostate_%s' % i
            bmResult = h2o.n0.build_model(
                algo='gbm',
                destination_key=modelKey,
                training_frame=parse_key,
                parameters=parameters,
                timeoutSecs=60)
            bm = OutputObj(bmResult, 'bm')
            print "GBMResult:", h2o.dump_json(bm)

            # FIX! is this right for gridded? 
            job_key = bm.jobs[0].key.name
            # FIX! this isn't a full formed name (%)
            model_key = bm.jobs[0].dest.name
            jobs.append( (job_key, model_key) )
            totalGBMGridJobs += 1

        h2o_jobs.pollWaitJobs(timeoutSecs=300)
        elapsed = time.time() - start
        print "All GBM jobs completed in", elapsed, "seconds."
        print "totalGBMGridJobs:", totalGBMGridJobs

        for job_key, model_key in jobs:
            modelResult = h2o.n0.models(key=model_key)
            model = OutputObj(modelResult['models'][0]['output'], 'model')

            cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            cmm = OutputObj(cmmResult, 'cmm')
            print "\nLook!, can use dot notation: cmm.cm.confusion.matrix", cmm.cm.confusion_matrix, "\n"

            mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60)
            mmResultShort = mmResult['model_metrics'][0]
            del mmResultShort['frame'] # too much!
            mm = OutputObj(mmResultShort, 'mm')

            prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60)
            pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr')
Example #9
    def test_a_simple3(self):
        a = h2o.n0.endpoints()
        print h2o.dump_json(a)
        print "There are %s endpoints" % len(a['routes'])
        for l in a['routes']:
            print l['url_pattern']
    def test_hdfs_cdh5(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle"
            ("and-testing.data", 60),
            ### "arcene2_train.both",
            ### "arcene_train.both",
            ### "bestbuy_test.csv",
            ("covtype.data", 60),
            ("covtype4x.shuffle.data", 60),
            # "four_billion_rows.csv",
            ("hhp.unbalanced.012.data.gz", 60),
            ("hhp.unbalanced.data.gz", 60),
            ("leads.csv", 60),
            # ("covtype.169x.data", 1200),
            ("prostate_long_1G.csv", 200),
            ("airlines_all.csv", 1200),
        ]

        # pick 8 randomly!
        if (1==0):
            csvFilenameList = random.sample(csvFilenameAll,8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        trial = 0
        print "try importing /tmp2"
        d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
        for (csvFilename, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir 
            print "Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a.hex"
            csvPathname = "datasets/" + csvFilename

            parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
            print "hdfs parse of", csvPathname, "took", time.time() - start, 'secs'
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            if DO_EXPORT:
                start = time.time()
                print "Saving", csvFilename, 'to HDFS'
                print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
                print "Unique per-user to avoid permission issues"
                username = getpass.getuser()
                csvPathname = "tmp2/a%s.%s.csv" % (trial, username)
                # reuse the file name to avoid running out of space
                csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files', username)

                path = "hdfs://"+ h2o.nodes[0].hdfs_name_node + "/" + csvPathname
                h2o.nodes[0].export_files(src_key=hex_key, path=path, force=1, timeoutSecs=timeoutSecs)
                print "export_files of", hex_key, "to", path, "took", time.time() - start, 'secs'
                trial += 1

                print "Re-Loading", csvFilename, 'from HDFS'
                start = time.time()
                hex_key = "a2.hex"
                time.sleep(2)
                d = h2i.import_only(path=csvPathname, schema='hdfs', timeoutSecs=1000)
                print h2o.dump_json(d)
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key, timeoutSecs=1000)
                print "hdfs re-parse of", csvPathname, "took", time.time() - start, 'secs'
Example #11
    def test_exec2_xorsum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        tryList = [
            (ROWS, 1, 'r1', 0, 10, None),
        ]

        for trial in range(10):
            ullResultList = []
            for (rowCount, colCount, hex_key, expectedMin, expectedMax, expected) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                # dynamic range of the data may be useful for estimating error
                maxDelta = expectedMax - expectedMin

                csvFilename = 'syn_real_' + str(rowCount) + 'x' + str(colCount) + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)
                print "Creating random", csvPathname
                (expectedUllSum, expectedFpSum)  = write_syn_dataset(csvPathname, 
                    rowCount, colCount, expectedMin, expectedMax, SEEDPERFILE)
                expectedUllSumAsDouble = h2o_util.unsignedLongLongToDouble(expectedUllSum)
                expectedFpSumAsLongLong = h2o_util.doubleToUnsignedLongLong(expectedFpSum)

                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                    timeoutSecs=3000, retryDelaySecs=2)
                numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
                assert parse_key == hex_key
                assert numCols == colCount
                assert numRows == rowCount

                inspect = h2o_cmd.runInspect(key=hex_key)
                missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
                assert len(missingList) == 0

                # looking at the 8 bytes of bits for the h2o doubles
                # xorsum will zero out the sign and exponent
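                # exprList is assumed to be defined at module scope in the original
                # test file (not shown in this snippet)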
                for execExpr in exprList:
                    for r in range(10):
        
                        if 1==0:
                            execResult = h2o_cmd.runExec(ast=execExpr, timeoutSecs=30)
                            fpResult = execResult['scalar']
                        else:
                            (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='x', timeoutSecs=300)
                            # print dump_json(h2o.n0.frames(key="h"))

                        # (execResult, fpResult) = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey='h', timeoutSecs=300)
                        # print dump_json(h2o.n0.frames(key="r1"))
                        print r, "execResult:", h2o.dump_json(execResult)
                        h2o_cmd.runStoreView()

                        ullResult = h2o_util.doubleToUnsignedLongLong(fpResult)
                        ullResultList.append((ullResult, fpResult))

                        print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)
                        print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                        print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)

                        # allow diff of the lsb..either way
                        # if ullResult!=expectedUllSum and abs((ullResult-expectedUllSum)>3):
                        if ullResult!=expectedUllSum:
                            raise Exception("h2o didn't get the same xorsum as python. 0x%0.16x 0x%0.16x" % \
                                (ullResult, expectedUllSum))

                h2o.check_sandbox_for_errors()

                print "first result was from a sum. others are xorsum"
                print "ullResultList:"
                for ullResult, fpResult in ullResultList:
                    print "%30s" % "ullResult (0.16x):", "0x%0.16x   %s" % (ullResult, fpResult)

                print "%30s" % "expectedUllSum (0.16x):", "0x%0.16x   %s" % (expectedUllSum, expectedUllSumAsDouble)
                print "%30s" % "expectedFpSum (0.16x):", "0x%0.16x   %s" % (expectedFpSumAsLongLong, expectedFpSum)
    def test_0_NA_2enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100,  30, '0', 'cC', 100),
            (100,  30, '0.0', 'cC', 100),
            (100,  30, '0.0000000', 'cC', 100),
            ]

        for (rowCount, colCount, zero, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, zero, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            # column 0 not used here
            # assert len(expected) == 6
            # FIX! add expected and maxDelta?
            co = h2o_cmd.runSummary(key=hex_key, column=0)
            print co.label, co.type, co.missing, co.domain, sum(co.bins)
            coList = [co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing,
                co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros]

            for k,v in co:
                print k, v

            if DO_REBALANCE:
                print "Rebalancing it to create an artificially large # of chunks"
                rb_key = "rb_%s" % hex_key
                start = time.time()
                print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
                rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
                elapsed = time.time() - start
                print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds'
            else:
                rb_key = hex_key

            print "Now doing to_enum across all columns of %s" % hex_key
            for column_index in range(colCount):
                # the column index is 1-based in to_enum
                result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index+1)
                # print "\nto_enum result:", h2o.dump_json(result)
                co = h2o_cmd.runSummary(key=hex_key, column=column_index+1)

                print co.label, co.type, co.missing, co.domain, sum(co.bins)
                coList = [co.base, len(co.bins), len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins, co.missing,
                    co.ninfs, co.pctiles, co.pinfs, co.precision, co.sigma, co.str_data, co.stride, co.type, co.zeros]

                if co.type != 'Enum':
                    raise Exception("column %s, which has name %s, didn't convert to Enum, is %s" % (column_index, colname, co.type))
                # I'm generating NA's ..so it should be > 0. .but it could be zero . I guess i have enough rows to get at least 1
                if co.missing<=0 or co.missing>rowCount:
                    raise Exception("column %s, which has name %s, somehow got NA cnt wrong after convert to Enum  %s %s" % 
                        (column_index, colname, co.missing, rowCount))

                if co.domain!=1: # NAs don't count?
                    # print "stats:", h2o.dump_json(stats)
                    print "column:", h2o.dump_json(co)
                    raise Exception("column %s, which has name %s, should have cardinality 1, got: %s" % (column_index, co.label, domain))
Example #13
    def test_hdfs_hdp2_1(self):
        print "\nLoad a list of files from HDFS, parse and do 1 RF tree"
        print "\nYou can try running as hduser/hduser if fail"
        # larger set in my local dir
        # fails because classes aren't integers
        #    "allstate_claim_prediction_train_set.zip",
        csvFilenameAll = [
            # "3G_poker_shuffle"
            ("and-testing.data", 60),
            ### "arcene2_train.both",
            ### "arcene_train.both",
            ### "bestbuy_test.csv",
            ("covtype.data", 60),
            ("covtype4x.shuffle.data", 60),
            # "four_billion_rows.csv",
            ("hhp.unbalanced.012.data.gz", 60),
            ("hhp.unbalanced.data.gz", 60),
            ("leads.csv", 60),
            # ("covtype.169x.data", 1200),
            ("prostate_long_1G.csv", 200),
            ("airlines_all.csv", 1200),
        ]

        # pick 8 randomly!
        if (1 == 0):
            csvFilenameList = random.sample(csvFilenameAll, 8)
        # Alternatively: do the list in order! Note the order is easy to hard
        else:
            csvFilenameList = csvFilenameAll

        # pop open a browser on the cloud
        # h2b.browseTheCloud()

        trial = 0
        print "try importing /tmp2"
        d = h2i.import_only(path="tmp2/*", schema='hdfs', timeoutSecs=1000)
        for (csvFilename, timeoutSecs) in csvFilenameList:
            # creates csvFilename.hex from file in hdfs dir
            print "Loading", csvFilename, 'from HDFS'
            start = time.time()
            hex_key = "a.hex"
            csvPathname = "datasets/" + csvFilename

            # Do a simple typeahead check on the directory
            # typeaheadResult 2: {
            #   "__meta": {
            #     "schema_name": "TypeaheadV2",
            #     "schema_type": "Iced",
            #     "schema_version": 2
            #   },
            #   "limit": 2,
            #   "matches": [
            #     "hdfs://172.16.2.186/datasets/15Mx2.2k.csv",
            #     "hdfs://172.16.2.186/datasets/1Mx2.2k.NAs.csv"
            #   ],
            #   "src": "hdfs://172.16.2.186/datasets/"
            # }

            typeaheadPath = "hdfs://" + h2o.nodes[
                0].hdfs_name_node + "/datasets/"
            typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath,
                                                     limit=2)
            print "typeaheadResult 2:", dump_json(typeaheadResult)
            assert len(typeaheadResult['matches']) == 2

            typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath,
                                                     limit=0)
            print "typeaheadResult 0:", dump_json(typeaheadResult)
            assert len(typeaheadResult['matches']) > 2

            typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath,
                                                     limit=None)
            print "typeaheadResult 0:", dump_json(typeaheadResult)
            assert len(typeaheadResult['matches']) > 2

            typeaheadResult = h2o.nodes[0].typeahead(src=typeaheadPath,
                                                     limit=-1)
            print "typeaheadResult -1:", dump_json(typeaheadResult)
            assert len(typeaheadResult['matches']) > 2

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='hdfs',
                                           hex_key=hex_key,
                                           timeoutSecs=1000)
            print "hdfs parse of", csvPathname, "took", time.time(
            ) - start, 'secs'
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key)
            parse_key = pA.parse_key
            numRows = iA.numRows
            numCols = iA.numCols
            labelList = iA.labelList

            if DO_EXPORT:
                start = time.time()
                print "Saving", csvFilename, 'to HDFS'
                print "Using /tmp2 to avoid the '.' prefixed files in /tmp2 (kills import)"
                print "Unique per-user to avoid permission issues"
                username = getpass.getuser()
                csvPathname = "tmp2/a%s.%s.csv" % (trial, username)
                # reuse the file name to avoid running out of space
                csvPathname = "tmp2/a%s.%s.csv" % ('_h2o_export_files',
                                                   username)

                path = "hdfs://" + h2o.nodes[
                    0].hdfs_name_node + "/" + csvPathname
                h2o.nodes[0].export_files(src_key=hex_key,
                                          path=path,
                                          force=1,
                                          timeoutSecs=timeoutSecs)
                print "export_files of", hex_key, "to", path, "took", time.time(
                ) - start, 'secs'
                trial += 1

                print "Re-Loading", csvFilename, 'from HDFS'
                start = time.time()
                hex_key = "a2.hex"
                time.sleep(2)
                d = h2i.import_only(path=csvPathname,
                                    schema='hdfs',
                                    timeoutSecs=1000)
                print h2o.dump_json(d)
                parseResult = h2i.import_parse(path=csvPathname,
                                               schema='hdfs',
                                               hex_key=hex_key,
                                               timeoutSecs=1000)
                print "hdfs re-parse of", csvPathname, "took", time.time(
                ) - start, 'secs'
    def test_mixed_int_enum_many(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
        enumList = ["abc", "def", "ghi"]
        # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
        intList = [0, 1, 2, ""]
        expectedList = ["abc", "def", "ghi"]

        tryList = [
            # not sure about this case
            # some of the cases interpret as ints now (not as enum)
            (ROWS, COLS, "a.hex", enumList[0:1], expectedList[0:1], intList[0:2], False),
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, COLS, "b.hex", enumList[0:2], expectedList[0:2], intList[0:1], True),
            # fails this case
            (ROWS, COLS, "c.hex", enumList[0:1], expectedList[0:1], intList[0:1], True),
            (ROWS, COLS, "d.hex", enumList[0:], expectedList[0:], intList[0:1], True),
            (ROWS, COLS, "e.hex", enumList[0:2], expectedList[0:2], intList[0:2], True),
            # this case seems to fail
            (ROWS, COLS, "f.hex", enumList[0:1], expectedList[0:1], intList[0:2], True),
            # this seems wrong also
            (ROWS, COLS, "g.hex", enumList[0:], expectedList[0:], intList[0:2], True),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        class Column(object):
            def __init__(self, column):
                assert isinstance(column, dict)
                for k, v in column.iteritems():
                    setattr(self, k, v)  # achieves self.k = v

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, enumChoices, enumExpected, intChoices, resultIsEnum) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = "syn_" + "binary" + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None, csvPathname, returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, enumChoices, intChoices)
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", check_header=0, hex_key=hex_key, timeoutSecs=10, doSummary=False
            )
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            print "numRows:", numRows, "numCols:", numCols
            inspect = h2o_cmd.runInspect(None, hex_key)

            print "\nTrial:", trial, csvFilename

            # this summary only does one column?
            # assert colCount == len(columns), "%s %s" % (colCount, len(columns))

            for i in range(colCount):
                summaryResult = h2o_cmd.runSummary(key=hex_key, column="C" + str(i + 1))
                h2o.verboseprint("summaryResult:", h2o.dump_json(summaryResult))

                # columns = summaryResult['frames'][0]['columns']
                co = Column(summaryResult)
                # how are enums binned. Stride of 1? (what about domain values)
                coList = [
                    co.base,
                    len(co.bins),
                    len(co.data),
                    co.domain,
                    co.label,
                    co.maxs,
                    co.mean,
                    co.mins,
                    co.missing,
                    co.ninfs,
                    co.pctiles,
                    co.pinfs,
                    co.precision,
                    co.sigma,
                    co.str_data,
                    co.stride,
                    co.type,
                    co.zeros,
                ]

                coNameList = [
                    "co.base",
                    "len(co.bins)",
                    "len(co.data)",
                    "co.domain",
                    "co.label",
                    "co.maxs",
                    "co.mean",
                    "co.mins",
                    "co.missing",
                    "co.ninfs",
                    "co.pctiles",
                    "co.pinfs",
                    "co.precision",
                    "co.sigma",
                    "co.str_data",
                    "co.stride",
                    "co.type",
                    "co.zeros",
                ]

                for c, n in zip(coList, coNameList):
                    print n + ":", c

                print "len(co.bins):", len(co.bins)

                print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(co.mean)
                # what is precision. -1?
                # This can go to NaN (string) with big numbers
                # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

                # can be None if col is all NA
                # print "FIX! hacking the co.pctiles because it's short by two"
                # pctiles = [0] + co.pctiles + [0]

                assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (co.zeros, numRows)

                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(
                        co.type,
                        "enum",
                        "Expecting co.type %s to be 'enum' for %s co label  %s" % (co.type, i, co.label),
                    )

                if ENABLE_ASSERTS and resultIsEnum:
                    # not always there
                    cardinality = len(co.domain)
                    self.assertEqual(
                        cardinality,
                        len(enumChoices),
                        msg="trial %s: cardinality %s should be %s" % (trial, cardinality, len(enumChoices)),
                    )

                # assume I create the list above in the same order that h2o will show the order. sorted?
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(co.bins, enumChoices)

                hcntTotal = sum(co.bins)
                numRowsCreated = rowCount + len(intChoices)
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(hcntTotal, numRowsCreated - expectedNaCnt[i])

                self.assertEqual(
                    numRows, numRowsCreated, msg="trial %s: numRows %s should be %s" % (trial, numRows, numRowsCreated)
                )

                nacnt = co.missing
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(
                        nacnt,
                        expectedNaCnt[i],
                        "trial %s: Column %s Expected %s. nacnt %s incorrect" % (trial, i, expectedNaCnt[i], nacnt),
                    )

                # FIX! no checks for the case where it got parsed as int column!
            trial += 1
Example #15
    def test_0_NA_2enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 30, '0', 'cC', 100),
            (100, 30, '0.0', 'cC', 100),
            (100, 30, '0.0000000', 'cC', 100),
        ]

        for (rowCount, colCount, zero, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, zero,
                              SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=30,
                                           doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=rowCount,
                                  expectedNumCols=colCount)
            print pA.numRows, pA.numCols, pA.parse_key

            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=rowCount,
                                    expectedNumCols=colCount,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            # column 0 not used here
            # assert len(expected) == 6
            # FIX! add expected and maxDelta?
            co = h2o_cmd.runSummary(key=hex_key, column=0)
            print co.label, co.type, co.missing, co.domain, sum(co.bins)
            coList = [
                co.base,
                len(co.bins),
                len(co.data), co.domain, co.label, co.maxs, co.mean, co.mins,
                co.missing, co.ninfs, co.pctiles, co.pinfs, co.precision,
                co.sigma, co.str_data, co.stride, co.type, co.zeros
            ]

            for k, v in co:
                print k, v

            if DO_REBALANCE:
                print "Rebalancing it to create an artificially large # of chunks"
                rb_key = "rb_%s" % hex_key
                start = time.time()
                print "Rebalancing %s to %s with %s chunks" % (
                    hex_key, rb_key, REBALANCE_CHUNKS)
                rebalanceResult = h2o.nodes[0].rebalance(
                    source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
                elapsed = time.time() - start
                print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds'
            else:
                rb_key = hex_key

            print "Now doing to_enum across all columns of %s" % hex_key
            for column_index in range(colCount):
                # the column index is 1-based in to_enum
                result = h2o.nodes[0].to_enum(None,
                                              src_key=hex_key,
                                              column_index=column_index + 1)
                # print "\nto_enum result:", h2o.dump_json(result)
                co = h2o_cmd.runSummary(key=hex_key, column=column_index + 1)

                print co.label, co.type, co.missing, co.domain, sum(co.bins)
                coList = [
                    co.base,
                    len(co.bins),
                    len(co.data), co.domain, co.label, co.maxs, co.mean,
                    co.mins, co.missing, co.ninfs, co.pctiles, co.pinfs,
                    co.precision, co.sigma, co.str_data, co.stride, co.type,
                    co.zeros
                ]

                if co.type != 'Enum':
                    raise Exception(
                        "column %s, which has name %s, didn't convert to Enum, is %s"
                        % (column_index, co.label, co.type))
                # NAs are generated, so the count should be > 0; with this many rows there should be at least one
                if co.missing <= 0 or co.missing > rowCount:
                    raise Exception(
                        "column %s, which has name %s, somehow got the NA count wrong after converting to Enum  %s %s"
                        % (column_index, co.label, co.missing, rowCount))

                if len(co.domain) != 1:  # NAs don't count?
                    # print "stats:", h2o.dump_json(stats)
                    print "column:", h2o.dump_json(co)
                    raise Exception(
                        "column %s, which has name %s, should have cardinality 1, got: %s"
                        % (column_index, co.label, co.domain))
Example #16
    def test_mixed_int_enum_many(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        # this should be a sorted list for comparing to hbrk in the histogram in h2o summary?
        enumList = ['abc', 'def', 'ghi']
        # numbers 1 and 2 may not be counted as NAs correctly? what about blank space?
        intList = [0, 1, 2, '']
        expectedList = ['abc', 'def', 'ghi']

        tryList = [
            # not sure about this case
            # some of the cases interpret as ints now (not as enum)
            (ROWS, COLS, 'a.hex', enumList[0:1], expectedList[0:1],
             intList[0:2], False),
            # colname, (min, 25th, 50th, 75th, max)
            (ROWS, COLS, 'b.hex', enumList[0:2], expectedList[0:2],
             intList[0:1], True),
            # fails this case
            (ROWS, COLS, 'c.hex', enumList[0:1], expectedList[0:1],
             intList[0:1], True),
            (ROWS, COLS, 'd.hex', enumList[0:], expectedList[0:], intList[0:1],
             True),
            (ROWS, COLS, 'e.hex', enumList[0:2], expectedList[0:2],
             intList[0:2], True),
            # this case seems to fail
            (ROWS, COLS, 'f.hex', enumList[0:1], expectedList[0:1],
             intList[0:2], True),
            # this seems wrong also
            (ROWS, COLS, 'g.hex', enumList[0:], expectedList[0:], intList[0:2],
             True),
        ]

        timeoutSecs = 10
        trial = 1
        n = h2o.nodes[0]
        lenNodes = len(h2o.nodes)

        class Column(object):
            def __init__(self, column):
                assert isinstance(column, dict)
                for k, v in column.iteritems():
                    setattr(self, k, v)  # achieves self.k = v

        x = 0
        timeoutSecs = 60
        for (rowCount, colCount, hex_key, enumChoices, enumExpected,
             intChoices, resultIsEnum) in tryList:
            # max error = half the bin size?

            SEEDPERFILE = random.randint(0, sys.maxint)
            x += 1

            csvFilename = 'syn_' + "binary" + "_" + str(rowCount) + 'x' + str(
                colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            csvPathnameFull = h2i.find_folder_and_filename(None,
                                                           csvPathname,
                                                           returnFullPath=True)

            print "Creating random", csvPathname
            expectedNaCnt = write_syn_dataset(csvPathname, rowCount, colCount,
                                              SEEDPERFILE, enumChoices,
                                              intChoices)
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           check_header=0,
                                           hex_key=hex_key,
                                           timeoutSecs=10,
                                           doSummary=False)
            numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
            print "numRows:", numRows, "numCols:", numCols
            inspect = h2o_cmd.runInspect(None, hex_key)

            print "\nTrial:", trial, csvFilename

            # this summary only does one column?
            # assert colCount == len(columns), "%s %s" % (colCount, len(columns))

            for i in range(colCount):
                summaryResult = h2o_cmd.runSummary(key=hex_key,
                                                   column="C" + str(i + 1))
                h2o.verboseprint("summaryResult:",
                                 h2o.dump_json(summaryResult))

                # columns = summaryResult['frames'][0]['columns']
                co = Column(summaryResult)
                # how are enums binned. Stride of 1? (what about domain values)
                coList = [
                    co.base,
                    len(co.bins),
                    len(co.data),
                    co.domain,
                    co.label,
                    co.maxs,
                    co.mean,
                    co.mins,
                    co.missing,
                    co.ninfs,
                    co.pctiles,
                    co.pinfs,
                    co.precision,
                    co.sigma,
                    co.str_data,
                    co.stride,
                    co.type,
                    co.zeros,
                ]

                coNameList = [
                    'co.base',
                    'len(co.bins)',
                    'len(co.data)',
                    'co.domain',
                    'co.label',
                    'co.maxs',
                    'co.mean',
                    'co.mins',
                    'co.missing',
                    'co.ninfs',
                    'co.pctiles',
                    'co.pinfs',
                    'co.precision',
                    'co.sigma',
                    'co.str_data',
                    'co.stride',
                    'co.type',
                    'co.zeros',
                ]

                for c, n in zip(coList, coNameList):
                    print n + ":", c

                print "len(co.bins):", len(co.bins)

                print "co.label:", co.label, "mean (2 places):", h2o_util.twoDecimals(
                    co.mean)
                # what is precision. -1?
                # This can go to NaN (string) with big numbers
                # print "co.label:", co.label, "std dev. (2 places):", h2o_util.twoDecimals(co.sigma)

                # can be None if col is all NA
                # print "FIX! hacking the co.pctiles because it's short by two"
                # pctiles = [0] + co.pctiles + [0]

                assert co.zeros <= numRows, "Can't have more zeros than rows %s %s" % (
                    co.zeros, numRows)

                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(
                        co.type, 'enum',
                        "Expecting co.type %s to be 'enum' for %s co label  %s"
                        % (co.type, i, co.label))

                if ENABLE_ASSERTS and resultIsEnum:
                    # not always there
                    cardinality = len(co.domain)
                    self.assertEqual(
                        cardinality,
                        len(enumChoices),
                        msg="trial %s: cardinality %s should be %s" %
                        (trial, cardinality, len(enumChoices)))

                # assume I create the list above in the same order that h2o will show the order. sorted?
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(co.bins, enumChoices)

                hcntTotal = sum(co.bins)
                numRowsCreated = rowCount + len(intChoices)
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(hcntTotal,
                                     numRowsCreated - expectedNaCnt[i])

                self.assertEqual(numRows,
                                 numRowsCreated,
                                 msg="trial %s: numRows %s should be %s" %
                                 (trial, numRows, numRowsCreated))

                nacnt = co.missing
                if ENABLE_ASSERTS and resultIsEnum:
                    self.assertEqual(
                        nacnt, expectedNaCnt[i],
                        "trial %s: Column %s Expected %s. nacnt %s incorrect" %
                        (trial, i, expectedNaCnt[i], nacnt))

                # FIX! no checks for the case where it got parsed as int column!
            trial += 1