Code example #1
    def test_parse_summary_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        timeoutSecs = 300

        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        parseResult = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList))
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)

        csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
        validation_key = 'test.hex'
        parseResult = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=validation_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList))

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=validation_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
Code example #2
    def test_parse_summary_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        timeoutSecs = 300

        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        parseResult = h2i.import_parse(path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList))
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key)
        h2o_cmd.infoFromSummary(rSummary, rows=numRows, cols=numCols)

        csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
        validation_key = 'test.hex'
        parseResult = h2i.import_parse(path=csvPathname_test,
                                       hex_key=validation_key,
                                       timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=validation_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList))

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=validation_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
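
Examples #1 and #2 repeat the same inspect-then-summary pair for the train and test keys, differing only in whether rows/cols are passed to runSummary or to infoFromSummary. A small helper along these lines (hypothetical, assuming the h2o_cmd API behaves as used above) would keep the key and pathname consistent between the two calls:

    def inspect_and_summarize(hex_key, csvPathname):
        # hypothetical helper mirroring the pattern in examples #1 and #2
        inspect = h2o_cmd.runInspect(key=hex_key)
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
        return numRows, numCols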
Code example #3
File: test_exec2_runif.py Project: smarthomekit/h2o
    def test_exec2_runif(self):
        print "h2o syntax is not full R. Doesn't take min/max interval params. assumed 0/1 interval"
        print " just one param, it must be a column or row vector. Result is same length"
        print " R allows a scalar to be param"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            'r0.hex = r.hex[,1]',
            's0.hex = runif(r.hex[,1],-1)',
            's1.hex = runif(r.hex[,1],-1)',
            's2.hex = runif(r.hex[,1],-1)',
            # error. this causes exception
            # 's3.hex = runif(nrow(r.hex), -1)',
        ]

        results = []
        for execExpr in execExprList:
            start = time.time()
            (resultExec, result) = h2e.exec_expr(
                execExpr=execExpr, timeoutSecs=30)  # unneeded but interesting
            results.append(result)
            print "exec end on ", "operators", 'took', time.time(
            ) - start, 'seconds'
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(resultExec)
            h2o.check_sandbox_for_errors()

        rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        if 1 == 0:
            print "Comparing summary of r.hex to summary of s.hex"
            df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
            # time can be different
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertLess(len(df.difference), 2)

            print "results from the individual exec expresssions (ignore last which was an apply)"
            print "results:", results
            self.assertEqual(results, [
                0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567,
                1859.0, 1859.0
            ])
Code example #4
File: test_many_fp_formats.py Project: 100star/h2o
    def test_many_fp_formats(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (100, 100, 'cB', 180),
            (100000, 10, 'cA', 180),
            # (100, 900, 'cC', 30),
            # (100, 500, 'cD', 30),
            # (100, 100, 'cE', 30),
            ]
        
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            NUM_CASES = h2o_util.fp_format()
            print "Will do %s" % NUM_CASES
            for sel in range(NUM_CASES): # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

                # use a per-case key; rebinding hex_key would accumulate suffixes across iterations
                hex_key_sel = hex_key + "_" + str(sel)
                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key_sel,
                    timeoutSecs=timeoutSecs, doSummary=False)
                h2o_cmd.runSummary(key=parseResult['destination_key'], max_qbins=100)

                print "Parse result['destination_key']:", hex_key_sel
                inspect = h2o_cmd.runInspect(None, hex_key_sel)
                print "Removing", hex_key_sel
                h2o.nodes[0].remove_key(hex_key_sel)
Code example #5
    def test_many_fp_formats(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (100, 100, 'cB', 180),
            (100000, 10, 'cA', 180),
            # (100, 900, 'cC', 30),
            # (100, 500, 'cD', 30),
            # (100, 100, 'cE', 30),
            ]
        
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            NUM_CASES = h2o_util.fp_format()
            print "Will do %s" % NUM_CASES
            for sel in range(NUM_CASES): # len(caseList)
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

                # use a per-case key; rebinding hex_key would accumulate suffixes across iterations
                hex_key_sel = hex_key + "_" + str(sel)
                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key_sel,
                    timeoutSecs=timeoutSecs, doSummary=False)
                h2o_cmd.runSummary(key=parseResult['destination_key'], max_qbins=100)

                print "Parse result['destination_key']:", hex_key_sel
                inspect = h2o_cmd.runInspect(None, hex_key_sel)
                print "Removing", hex_key_sel
                h2o.nodes[0].remove_key(hex_key_sel)
Code example #6
    def test_exec2_runif(self):
        print "in h2o-dev, params are column, min, max, seed"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            # params for h2o-dev runif are: column, min, max, seed
            AssignObj('r0.hex', KeyIndexed('r.hex', col=0)),
            AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1)),
            AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1)),
            AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1)),
        ]

        results = []
        for execExpr in execExprList:
            start = time.time()
            result = execExpr.do(timeoutSecs=30)
            results.append(result)
            execResult = execExpr.execResult
            print "exec took", time.time() - start, "seconds"
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(execResult)
            h2o.check_sandbox_for_errors()

        rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        if 1 == 0:
            print "Comparing summary of r.hex to summary of s.hex"
            df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
            # time can be different
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertLess(len(df.difference), 2)

            print "results from the individual exec expresssions (ignore last which was an apply)"
            print "results:", results
            self.assertEqual(results, [
                0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567,
                1859.0, 1859.0
            ])
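
The AssignObj/Fcn/KeyIndexed objects above build an h2o-dev exec expression and run it via .do(). A single assignment, under the same assumptions as this example, composes like this:

    # sketch: s.hex = h2o.runif(r.hex[,0], -1), assuming the DSL classes used above
    expr = AssignObj('s.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), -1))
    result = expr.do(timeoutSecs=30)  # runs the expression and returns its result value
    print "full exec result:", h2o.dump_json(expr.execResult)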
Code example #7
    def test_create_frame_rand1(self):
        h2o.beta_features = True
        # default
        params = {'rows': 1, 'cols': 1}
        for trial in range(20):
            h2o_util.pickRandParams(paramDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)
            # h2o does some strict checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None

            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0

            kwargs = params.copy()

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path='poker/poker1000',
                                           hex_key='temp1000.hex',
                                           schema='put',
                                           timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex',
                                                 timeoutSecs=timeoutSecs,
                                                 **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex',
                                          csvPathname=csvPathname,
                                          timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            h2o_cmd.runSummary(key='temp1000.hex', timeoutSecs=300)
            print h2o.dump_json(cfResult)

            print "Trial #", trial, "completed"
Code example #8
File: test_exec2_runif.py Project: BersaKAIN/h2o
    def test_exec2_runif(self):
        h2o.beta_features = True
        print "h2o syntax is not full R. Doesn't take min/max interval params. assumed 0/1 interval"
        print " just one param, it must be a column or row vector. Result is same length"
        print " R allows a scalar to be param"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            'r0.hex = r.hex[,1]',
            's0.hex = runif(r.hex[,1],-1)',
            's1.hex = runif(r.hex[,1],-1)',
            's2.hex = runif(r.hex[,1],-1)',
            # error. this causes exception
            # 's3.hex = runif(nrow(r.hex), -1)',
        ]

        results = []
        for execExpr in execExprList:
            start = time.time()
            (resultExec, result) = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30) # unneeded but interesting 
            results.append(result)
            print "exec end on ", "operators" , 'took', time.time() - start, 'seconds'
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(resultExec)
            h2o.check_sandbox_for_errors()

        rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        if 1==0:
            print "Comparing summary of r.hex to summary of s.hex"
            df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
            # time can be different
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertLess(len(df.difference), 2)
        

            print "results from the individual exec expresssions (ignore last which was an apply)"
            print "results:", results
            self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0])
Code example #9
File: test_put_parse4.py Project: BhaskarPros/h2o
 def test_put_parse4(self):
     timeoutSecs = 10
     trial = 1
     n = h2o.nodes[0]
     for x in xrange(2):
         print 'Trial:', trial
         csvPathname = 'iris/iris_wheader.csv.gz'
         hex_key = "iris" + "_" + str(x) + ".hex"
         parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=hex_key, doSummary=False, schema='put')
         h2o_cmd.runSummary(key=hex_key)
         trial += 1
Code example #10
File: test_put_parse4.py Project: NidhiMehta/h2o
 def test_put_parse4(self):
     timeoutSecs = 10
     trial = 1
     n = h2o.nodes[0]
     for x in xrange(2):
         print 'Trial:', trial
         # csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
         csvPathname = h2o.find_file('smalldata/iris/iris_wheader.csv.gz')
         key2 = "iris" + "_" + str(x) + ".hex"
         parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, doSummary=False)
         h2o_cmd.runSummary(key=key2, doPrint=True)
         trial += 1
Code example #11
    def test_create_frame_rand1(self):
        h2o.beta_features = True
        # default
        params = {
            'rows': 1, 
            'cols': 1
        }
        for trial in range(20):
            h2o_util.pickRandParams(paramDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)
            # h2o does some strict checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:   
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None
                
            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0


            kwargs = params.copy()

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', 
                schema='put', timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            h2o_cmd.runSummary(key='temp1000.hex')
            print h2o.dump_json(cfResult)
    
            print "Trial #", trial, "completed"
Code example #12
    def test_rf_airlines_2013_fvec(self):
        h2o.beta_features = True
        h2b.browseTheCloud()

        csvFilename = 'year2013.csv'
        hex_key = 'year2013.hex'
        importFolderPath = 'airlines'
        csvPathname = importFolderPath + "/" + csvFilename
        start = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       timeoutSecs=900,
                                       doSummary=False)
        parse_time = time.time() - start
        print "parse took {0} sec".format(parse_time)

        start = time.time()
        # noise=['JStack','cpu','disk'])
        h2o_cmd.runSummary(key=hex_key, timeoutSecs=200)
        elapsed = time.time() - start
        print "summary took {0} sec".format(elapsed)

        trees = 10
        paramsTrainRF = {
            'ntrees': trees,
            'max_depth': 20,
            'nbins': 200,
            'ignored_cols_by_name':
            'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
            'timeoutSecs': 14800,
        }
        kwargs = paramsTrainRF.copy()
        start = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        (classification_error, classErrorPctList,
         totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
            trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}'.format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename,
            elapsed, trees, classification_error, classErrorPctList,
            totalScores)
        print "\n" + l
        h2o.cloudPerfH2O.message(l)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
Code example #13
    def test_exec2_runif(self):
        print "in h2o-dev, params are column, min, max, seed"
        bucket = 'home-0xdiag-datasets'
        csvPathname = 'standard/covtype.data'
        hexKey = 'r.hex'
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)
        # work up to the failing case incrementally
        execExprList = [
            # hack to make them keys? (not really needed but interesting)
            # params for h2o-dev runif are: column, min, max, seed 
            AssignObj('r0.hex', KeyIndexed('r.hex', col=0)),
            AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1)),
            AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1)),
            AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1)),
        ]

        results = []
        for execExpr in execExprList:
            start = time.time()
            result = execExpr.do(timeoutSecs=30)
            results.append(result)
            execResult = execExpr.execResult
            print "exec took", time.time() - start, "seconds"
            print "exec result:", result
            print "exec result (full):", h2o.dump_json(execResult)
            h2o.check_sandbox_for_errors()

        rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0')
        # h2o_cmd.infoFromSummary(rSummary)

        sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0')
        # h2o_cmd.infoFromSummary(sSummary)

        # since there are no NAs in covtype, r.hex and s.hex should be identical?
        if 1==0:
            print "Comparing summary of r.hex to summary of s.hex"
            df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True)
            # time can be different
            print "df.difference:", h2o.dump_json(df.difference)
            self.assertLess(len(df.difference), 2)
        

            print "results from the individual exec expresssions (ignore last which was an apply)"
            print "results:", results
            self.assertEqual(results, [0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0])
Code example #14
 def test_put_parse4(self):
     timeoutSecs = 10
     trial = 1
     n = h2o.nodes[0]
     for x in xrange(2):
         print 'Trial:', trial
         # csvPathname = h2o.find_file("smalldata/hhp_107_01.data.gz")
         csvPathname = h2o.find_file('smalldata/iris/iris_wheader.csv.gz')
         key2 = "iris" + "_" + str(x) + ".hex"
         parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                      key2=key2,
                                      doSummary=False)
         h2o_cmd.runSummary(key=key2, doPrint=True)
         trial += 1
Code example #15
 def test_put_parse4(self):
     timeoutSecs = 10
     trial = 1
     n = h2o.nodes[0]
     for x in xrange(2):
         print 'Trial:', trial
         csvPathname = 'iris/iris_wheader.csv.gz'
         hex_key = "iris" + "_" + str(x) + ".hex"
         parseResult = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname,
                                        hex_key=hex_key,
                                        doSummary=False,
                                        schema='put')
         h2o_cmd.runSummary(key=hex_key)
         trial += 1
Code example #16
    def test_parse_covtype_2(self):

        tryList = [
            ('covtype.data', 1, 30),
            # ('covtype20x.data', 20, 120),
        ]

        for (csvFilename, multiplyExpected, timeoutSecs) in tryList:

            for trial in range(16, 24):
                # import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
                importFolderPath = "standard"
                hex_key = 'covtype.hex'
                csvPathname = importFolderPath + "/" + csvFilename
                chunk_size = 2**trial
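                # trials 16..23 sweep chunk_size from 2**16 (64 KB) to 2**23 (8 MB)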
                print "Trial %s. Trying chunk_size %s (power of 2)" % (trial, chunk_size)

                parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                    timeoutSecs=timeoutSecs, hex_key=hex_key,
                    chunk_size=chunk_size, doSummary=False)
                pA = h2o_cmd.ParseObj(parseResult)
                iA = h2o_cmd.InspectObj(pA.parse_key)
                print iA.missingList, iA.labelList, iA.numRows, iA.numCols

                for i in range(1):
                    co = h2o_cmd.runSummary(key=hex_key, column=i)

                k = parseResult['frames'][0]['frame_id']['name']
                # print "parseResult:", dump_json(parseResult)
                a_node = h2o.nodes[0]
                frames_result = a_node.frames(key=k, row_count=5)
                # print "frames_result from the first parseResult key", dump_json(frames_result)
                
                parseKeyIndexedCheck(frames_result, multiplyExpected)
Code example #17
    def test_parse_nfs(self):
        print "run as user 0xcustomer on machine with nfs /mnt/0xcustomer-datasets/c1"
        tryList = [
            ('iris2.csv', 'iris2.hex', 1, 30),
        ]

        for (csvFilename, hex_key, multiplyExpected, timeoutSecs) in tryList:
            importFolderPath = "/mnt/0xcustomer-datasets/c1"
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(path=csvPathname, schema='local',
                timeoutSecs=timeoutSecs, hex_key=hex_key, chunk_size=4194304/2, doSummary=False)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=150*multiplyExpected, 
                expectedNumCols=5, expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(0):
                print "Summary on column", i
                co = h2o_cmd.runSummary(key=hex_key, column=i)

            k = parseResult['frames'][0]['frame_id']['name']
            frames_result = h2o.nodes[0].frames(key=k, row_count=5)
            # print "frames_result from the first parseResult key", dump_json(frames_result)
            parseKeyIndexedCheck(frames_result, multiplyExpected)
Code example #18
    def test_from_import_fvec(self):
        csvFilenameAll = [
            ("covtype.data", 500),
            # ("covtype20x.data", 1000),
            ]

        for (csvFilename, timeoutSecs) in csvFilenameAll:
            # creates csvFilename.hex from file in importFolder dir 
            hex_key = csvFilename + '.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='local',
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], verbose=True)
            h2o_cmd.infoFromInspect(inspect, parseResult['destination_key'])

            summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
            # h2o_cmd.infoFromSummary(summaryResult)

            trees = 2
            start = time.time()
            rfView = h2o_cmd.runRF(trees=trees, max_depth=20, balance_classes=0, importance=1, parseResult=parseResult, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trees)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
                trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed, 
                    trees, classification_error, classErrorPctList, totalScores)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # just to make sure we test this
            h2i.delete_keys_at_all_nodes(pattern=hex_key)
Code example #19
File: test_c7_rel.py Project: brennane/h2o
    def test_c7_rel(self):
        h2o.beta_features = False
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        csvFilename = 'part-00000b'
        importFolderPath = '/mnt/0xcustomer-datasets/c2'
        csvPathname = importFolderPath + "/" + csvFilename

        # FIX! does 'separator=' take ints or ?? hex format
        # looks like it takes the hex string (two chars)
        start = time.time()
        # hardwire TAB as a separator, as opposed to white space (9)
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=True)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        print "Parse result['destination_key']:", parseResult['destination_key']

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']

        keepPattern = "oly_|mt_|b_"
        y = "is_purchase"
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
        print "x:", x

        kwargs = {
            'x': x,
            'y': y,
            # 'case_mode': '>',
            # 'case': 0,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 4,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600

        if DO_GLM:
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "glm completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)
Code example #20
    def test_NOPASS_create_frame_fail(self):
        h2o.beta_features = True

        for trial in range(20):
            kwargs = {
                'integer_range': None, 'missing_fraction': 0.1, 'cols': 10,
                'response_factors': 1, 'seed': 1234, 'randomize': 1,
                'categorical_fraction': 0, 'rows': 1, 'factors': 0,
                'real_range': 0, 'value': None, 'integer_fraction': 0,
            }

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', 
                schema='put', timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10)
            h2o_cmd.infoFromSummary(rSummary)

            print h2o.dump_json(cfResult)
    
            print "Trial #", trial, "completed"
Code example #21
File: genJson.py Project: 100star/h2o
def getSummaries():
    with open('./smalldata.csv', 'rb') as f:
        for line in f:
            PATHS = []
            NAMES = []
            NUMCOLS = 0
            NUMROWS = 0
            TYPES = []
            RANGES = []
            IGNORED = 'NA'
            TARGET = 'NA'

            DATANAME, uploadPath, importPath, importHDFS, fullPath = line.strip("\n").split(',')
            PATHS = [uploadPath, importPath, importHDFS]
            
            bucket = 'smalldata'
            path = '/'.join(importPath.split('/')[2:]).strip('"')
            parseResult = h2i.import_parse(bucket=bucket, path=path, schema='local', doSummary=False)
            summary = h2o_cmd.runSummary(key=parseResult['destination_key'])
            columns = summary['summary']['columns']
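            # assumption: this is the older (pre-Fluid-Vec) Summary response shape, with one
            # dict per column under summary['summary']['columns'] and a per-column count 'N'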
            
            NUMCOLS = len(columns)
            NUMROWS = columns[0]['N']
            for col in columns:
                NAMES.append('"' + col['name'] + '"')
                TYPES.append('"' + col['type'] + '"')
                tup = '("' + str(min(col['min'])) + '","' + str(max(col['max'])) + '")' if col['type'] == 'number' else '("NA", "NA")'
                RANGES += [tup]
            
            toJson(DATANAME, PATHS, NAMES, NUMCOLS, NUMROWS, TYPES, RANGES, IGNORED="NA", TARGET="NA")
Code example #22
    def test_NOPASS_exec2_empty_result(self):
        bucket = 'smalldata'
        csvPathname = 'iris/iris2.csv'
        hexKey = 'i.hex'
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=csvPathname,
                                       schema='put',
                                       hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=10)

        start = time.time()
        for execExpr in exprList:
            h2e.exec_expr(h2o.nodes[0],
                          execExpr,
                          resultKey=None,
                          timeoutSecs=10)
            rSummary = h2o_cmd.runSummary(key="a")
            h2o_cmd.infoFromSummary(rSummary)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators", 'took', time.time(
        ) - start, 'seconds'
Code example #23
    def test_parse_covtype_2(self):

        tryList = [
            ('covtype.data', 1, 30),
            ('covtype20x.data', 20, 120),
        ]

        for (csvFilename, multiplyExpected, timeoutSecs) in tryList:

            # import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv"))
            importFolderPath = "standard"
            hex_key = 'covtype.hex'
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                timeoutSecs=timeoutSecs, hex_key=hex_key,
                chunk_size=4194304*2, doSummary=False)
            pA = h2o_cmd.ParseObj(parseResult)

            iA = h2o_cmd.InspectObj(pA.parse_key)
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(1):
                print "Summary on column", i
                co = h2o_cmd.runSummary(key=hex_key, column=i)

            k = parseResult['frames'][0]['key']['name']
            # print "parseResult:", dump_json(parseResult)
            a_node = h2o.nodes[0]
            frames_result = a_node.frames(key=k, row_count=5)
            # print "frames_result from the first parseResult key", dump_json(frames_result)
            
            parseKeyIndexedCheck(frames_result, multiplyExpected)
Code example #24
    def test_parse_nfs(self):
        print "run as user 0xcustomer on machine with nfs /mnt/0xcustomer-datasets/c1"
        tryList = [
            ('iris2.csv', 'iris2.hex', 1, 30),
        ]

        for (csvFilename, hex_key, multiplyExpected, timeoutSecs) in tryList:
            importFolderPath = "/mnt/0xcustomer-datasets/c1"
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='local',
                                           timeoutSecs=timeoutSecs,
                                           hex_key=hex_key,
                                           chunk_size=4194304 / 2,
                                           doSummary=False)

            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=150 * multiplyExpected,
                                    expectedNumCols=5,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(0):
                print "Summary on column", i
                co = h2o_cmd.runSummary(key=hex_key, column=i)

            k = parseResult['frames'][0]['frame_id']['name']
            frames_result = h2o.nodes[0].frames(key=k, row_count=5)
            # print "frames_result from the first parseResult key", dump_json(frames_result)
            parseKeyIndexedCheck(frames_result, multiplyExpected)
Code example #25
File: test_storeview_import.py Project: 100star/h2o
    def test_storeview_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs)
            print h2o.dump_json(importResult)
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30)
            # print h2o.dump_json(storeViewResult)

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values, 
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"
            
            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
                result = h2o.dump_json(storeViewResult)
                f.write(result)
                f.close()
                lastStoreViewResult = storeViewResult
            

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Code example #26
        def do_summary_and_inspect():
            # SUMMARY******************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            coltypeList = h2o_cmd.infoFromSummary(summaryResult)

            # INSPECT******************************************
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Now check both inspect and summary
            if csvFilename == 'covtype.binary.svm':
                for k in range(55):
                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0))
                    stype = inspect['cols'][k]['type']
                    print k, stype
                    self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int'))

                # summary may report type differently than inspect..check it too!
                # we could check na here too
                for i, c in enumerate(coltypeList):
                    print "column index: %s  column type: %s" % (i, c)
                    # inspect says 'Int'?
                    assert c == 'Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i, c)
Code example #27
    def test_summary_with_x_libsvm(self):
        h2o.beta_features = True
        print "Empty rows except for the last, with all zeros for class. Single col at max"
        h2b.browseTheCloud()
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 100, 'cA', 300),
            (100000, 100, 'cB', 300),
            (100, 1000, 'cC', 300),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
                SEEDPERFILE = random.randint(0, sys.maxint)
                csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename

                print "Creating random", csvPathname
                # dict of col sums for comparison to exec col sums below
                (colNumberMax, synColSumDict) = write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, 
                    doSummary=False)
                print "Parse result['destination_key']:", parseResult['destination_key']
                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], max_column_display=colNumberMax+1, 
                    timeoutSecs=timeoutSecs)
                numCols = inspect['numCols']
                numRows = inspect['numRows']

                self.assertEqual(colNumberMax+1, numCols, 
                    msg="generated %s cols (including output).  parsed to %s cols" % (colNumberMax+1, numCols))
                self.assertEqual(rowCount, numRows, 
                    msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))

                for x in range(numCols):
                    print "Doing summary with x=%s" % x
                    summaryResult = h2o_cmd.runSummary(key=hex_key, cols=x, timeoutSecs=timeoutSecs)
                    # skip the infoFromSummary check

                    colName = "C" + str(x+1)
                    print "Doing summary with col name x=%s" % colName
                    summaryResult = h2o_cmd.runSummary(key=hex_key, cols=colName, timeoutSecs=timeoutSecs)

                # do a final one with all columns for the current check below
                # FIX! we should update the check to check each individual summary result
                print "Doing and checking summary with no x=%s" % x
                summaryResult = h2o_cmd.runSummary(key=hex_key, max_ncols=colNumberMax+1, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromSummary(summaryResult, noPrint=True)
Code example #28
File: test_0_NA_2enum.py Project: MadhaviGmv/h2o
    def test_0_NA_2enum(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100,  30, '0', 'cC', 100),
            (100,  30, '0.0', 'cC', 100),
            (100,  30, '0.0000000', 'cC', 100),
            ]

        for (rowCount, colCount, zero, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, zero, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvFilename


            if DO_REBALANCE:
                print "Rebalancing it to create an artificially large # of chunks"
                rb_key = "rb_%s" % hex_key
                start = time.time()
                print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
                rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
                elapsed = time.time() - start
                print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds'
            else:
                rb_key = hex_key

            print "Now doing to_enum across all columns of %s" % hex_key
            for column_index in range(colCount):
                # is the column index 1-base in to_enum
                result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index+1)
                # print "\nto_enum result:", h2o.dump_json(result)
                summaryResult = h2o_cmd.runSummary(key=hex_key)
                # check that it at least is an enum column now, with no na's
                # just look at the column we touched
                column = summaryResult['summaries'][column_index]
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']
                stats = column['stats']
                stattype = stats['type']
                cardinality = stats['cardinality']
                if stattype != 'Enum':
                    raise Exception("column %s, which has name %s, didn't convert to Enum, is %s %s" % (column_index, colname, stattype, coltype))
                # I'm generating NAs, so it should be > 0, but it could be zero; I guess I have enough rows to get at least 1
                if nacnt <= 0 or nacnt > rowCount:
                    raise Exception("column %s, which has name %s, somehow got NA cnt wrong after convert to Enum  %s %s" %
                        (column_index, colname, nacnt, rowCount))
                if cardinality != 1:  # NAs don't count?
                    # print "stats:", h2o.dump_json(stats)
                    print "column:", h2o.dump_json(column)
                    raise Exception("column %s, which has name %s, should have cardinality 1, got: %s" % (column_index, colname, cardinality))
                h2o_cmd.infoFromSummary(summaryResult)
Code example #29
    def test_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        for trial in range(2):
            csvFilename = "syn_ints.csv"
            hex_key = "1.hex"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, trial)
            timeoutSecs = 10

            # have to import each time, because h2o deletes source after parse

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir
            # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
            parseResult = h2i.import_parse(parser_type=PARSER_TYPE,
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000)

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
            print "Inspect:", hex_key, "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o_cmd.infoFromSummary(summaryResult)

            if DO_KMEANS:
                # KMEANS******************************************
                kwargs = {
                    'k': 3,
                    'initialization': 'Furthest',
                    'ignored_cols': None,  # range(11, numCols) # THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Code example #30
    def test_frame_split_balance(self):
        h2o.beta_features = True

        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"

        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       schema='local',
                                       timeoutSecs=20)

        print "Just split away and see if anything blows up"
        splitMe = hex_key
        inspect = h2o_cmd.runInspect(key=splitMe)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        for s in range(20):
            inspect = h2o_cmd.runInspect(key=splitMe)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            fs = h2o.nodes[0].frame_split(source=splitMe, ratios=0.5)
            split0_key = fs['split_keys'][0]
            split1_key = fs['split_keys'][1]
            split0_rows = fs['split_rows'][0]
            split1_rows = fs['split_rows'][1]
            split0_ratio = fs['split_ratios'][0]
            split1_ratio = fs['split_ratios'][1]
            print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
            splitMe = split1_key
            # split should be within 1 row accuracy. let's say within 20 for now
            self.assertLess(abs(split1_rows - split0_rows), 2)
            self.assertEqual(numRows, (split1_rows + split0_rows))
            self.assertEqual(numCols, origNumCols)
            if split0_rows <= 2:
                break

            print "Now do some rebalancing on the split frames"
            for trial in range(2):
                rb_key = "rb_%s_%s" % (trial, splitMe)
                SEEDPERFILE = random.randint(0, sys.maxint)
                randChunks = random.randint(1, 100)
                start = time.time()
                print "Trial %s: Rebalancing %s to %s with %s chunks" % (
                    trial, splitMe, rb_key, randChunks)
                rebalanceResult = h2o.nodes[0].rebalance(source=hex_key,
                                                         after=rb_key,
                                                         seed=SEEDPERFILE,
                                                         chunks=randChunks)
                elapsed = time.time() - start
                print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\
                h2o_cmd.runSummary(key=rb_key)
                print "\nInspecting the original parsed result"
                inspect = h2o_cmd.runInspect(key=hex_key)
                h2o_cmd.infoFromInspect(inspect=inspect)
                print "\nInspecting the rebalanced result with %s forced chunks" % randChunks
                inspect = h2o_cmd.runInspect(key=rb_key)
                h2o_cmd.infoFromInspect(inspect=inspect)
Code example #31
    def test_c7_rel(self):
        h2o.beta_features = False
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        csvFilename = 'part-00000b'
        importFolderPath = '/mnt/0xcustomer-datasets/c2'
        csvPathname = importFolderPath + "/" + csvFilename

        # FIX! does 'separator=' take ints or ?? hex format
        # looks like it takes the hex string (two chars)
        start = time.time()
        # hardwire TAB as a separator, as opposed to white space (9)
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        num_rows = inspect['num_rows']
        num_cols = inspect['num_cols']
        print "\n" + csvFilename, "    num_rows:", "{:,}".format(num_rows), "    num_cols:", "{:,}".format(num_cols)

        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=num_cols, numRows=num_rows, max_column_display=2500)
        # it's in runSummary!
        # h2o_cmd.infoFromSummary(summaryResult, noPrint=False, numCols=num_cols, numRows=num_rows)

        keepPattern = "oly_|mt_|b_"
        y = "is_purchase"
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
        print "x:", x

        kwargs = {
            'x': x,
            'y': y,
            # 'case_mode': '>',
            # 'case': 0,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 4,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600

        if DO_GLM:
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "glm completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
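
One detail worth calling out in the kwargs dict above: lambda is a reserved word in Python, so the GLM regularization strength can only travel as a dict key expanded with **kwargs, never as a literal keyword argument. A minimal sketch with a hypothetical stand-in function (no H2O cloud needed):

def run_glm_stub(**kwargs):
    # stand-in for h2o_cmd.runGLM: just echo what it would receive
    print sorted(kwargs.items())

kwargs = {'family': 'binomial', 'lambda': 1.0E-5, 'alpha': 0.5}
run_glm_stub(**kwargs)           # fine: 'lambda' travels as a dict key
# run_glm_stub(lambda=1.0E-5)    # SyntaxError: lambda is a reserved word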
Code example #32
0
File: test_create_frame_rand1.py Project: germc/h2o
    def test_create_frame_rand1(self):
        h2o.beta_features = True
        # default
        params = {
            'rows': 1, 
            'cols': 1
        }
        for trial in range(10):
            h2o_util.pickRandParams(paramDict, params)
            i = params.get('integer_fraction', 0)
            c = params.get('categorical_fraction', 0)
            r = params.get('randomize', 0)
            v = params.get('value', None)
            if r:
                if v is not None:
                    # fractions left as None would default to > 0, so zero them when a constant value is requested
                    params['integer_fraction'] = 0
                    params['categorical_fraction'] = 0
                elif (i and c) and (i + c) >= 1.0:
                    params['integer_fraction'] = i
                    params['categorical_fraction'] = 1.0 - i
            else:
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0
                params['value'] = None


            kwargs = params.copy()

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', 
                schema='put', timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            h2o_cmd.runSummary(key='temp1000.hex')
            print h2o.dump_json(cfResult)
    
            print "Trial #", trial, "completed"
Code example #33
0
File: h2o_kmeans.py Project: jmcclell/h2o
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    if h2o.beta_features:
        model_key = kmeans["model"]["_selfKey"]
        # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame

        # can't use inspect on a model key? now?
        kmeansResult = kmeans
        model = kmeansResult["model"]
        centers = model["clusters"]
        error = model["error"]
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        model = kmeansResult["KMeansModel"]
        centers = model["clusters"]
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        pass
        # no scoring on Kmeans2?..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey
        )
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult["summaries"][0]["hcnt"]  # histogram
        rows_per_cluster = hcnt
        # have to figure out how to get this with fvec
        sqr_error_per_cluster = [0 for h in hcnt]

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey
        )
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key)
        score = kmeansScoreResult["score"]
        rows_per_cluster = score["rows_per_cluster"]
        sqr_error_per_cluster = score["sqr_error_per_cluster"]

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", centers[i]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
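
In the beta path above, rows-per-cluster is read straight off the Summary histogram (hcnt) of the predicted cluster column. What that histogram amounts to, in plain Python (toy cluster assignments, no cloud needed):

from collections import Counter

predicted = [0, 2, 1, 0, 0, 2]        # hypothetical cluster ids from a predict
counts = Counter(predicted)
hcnt = [counts[c] for c in range(3)]  # one bin per cluster
print hcnt                            # -> [3, 1, 2]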
Code example #34
0
File: h2o_kmeans.py Project: Jfeng3/h2o
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    if h2o.beta_features:
        # can't use inspect on a model key? now?
        model = kmeans["model"]
        model_key = model["_key"]
        centers = model["centers"]
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        kmeansResult = kmeans
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        h2o.verboseprint("kmeans result:", h2o.dump_json(kmeansResult))
        model = kmeansResult["KMeansModel"]
        centers = model["clusters"]
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        pass
        # no scoring on Kmeans2?..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey
        )
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult["summaries"][0]["hcnt"]  # histogram
        rows_per_cluster = hcnt
        # FIX! does the cluster order/naming match, compared to cluster variances
        sqr_error_per_cluster = cluster_variances

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey
        )
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key)
        score = kmeansScoreResult["score"]
        rows_per_cluster = score["rows_per_cluster"]
        sqr_error_per_cluster = score["sqr_error_per_cluster"]

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
Code example #35
0
    def test_rf_airlines_2013_fvec(self):
        h2o.beta_features = True
        h2b.browseTheCloud()


        csvFilename = 'year2013.csv'
        hex_key = 'year2013.hex'
        importFolderPath = 'airlines'
        csvPathname = importFolderPath + "/" + csvFilename
        start      = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', 
            path=csvPathname, schema='local', hex_key=hex_key, timeoutSecs=900, doSummary=False)
        parse_time = time.time() - start 
        print "parse took {0} sec".format(parse_time)
        start = time.time()
        # noise=['JStack','cpu','disk'])
        h2o_cmd.runSummary(key=hex_key, timeoutSecs=200)
        elapsed = time.time() - start 
        print "summary took {0} sec".format(elapsed)

        trees = 10
        paramsTrainRF = { 
            'ntrees': trees, 
            'max_depth': 20,
            'nbins': 200,
            'ignored_cols_by_name': 'CRSDepTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,IsArrDelayed',
            'timeoutSecs': 14800,
            }
        kwargs   = paramsTrainRF.copy()
        start      = time.time()
        rfView = h2o_cmd.runRF(parseResult=parseResult, **kwargs)
        elapsed = time.time() - start
        (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)

        l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. \
            trees: {:} classification_error: {:} classErrorPct: {:} totalScores: {:}' .format(
            len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed,
                trees, classification_error, classErrorPctList, totalScores)
        print "\n"+l
        h2o.cloudPerfH2O.message(l)

        # just to make sure we test this
        h2i.delete_keys_at_all_nodes(pattern=hex_key)
Code example #36
0
    def test_parse_summary_manyfiles_s3_fvec(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirlist = [("manyfiles-nflx-gz", 800)]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            # change to 50 files
            csvPathname = csvDirname + "/file_[2][0-4][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(
                bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", timeoutSecs=timeoutSecs
            )

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets",
                path=csvPathname,
                schema="s3",
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=10,
                pollTimeoutSecs=120,
            )
            elapsed = time.time() - start
            print "parse end on ", parseResult["destination_key"], "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=360)
            print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult["destination_key"], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
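
The elapsed-time reporting used throughout these snippets follows one pattern: record start, run the call, then print the elapsed seconds and the percentage of the allotted timeout consumed. Isolated (pure Python, time.sleep standing in for the H2O call):

import time

timeoutSecs = 300
start = time.time()
time.sleep(0.25)                  # stand-in for the parse/inspect/summary call
elapsed = time.time() - start
print "took", elapsed, "seconds", "%d pct. of timeout" % ((elapsed * 100) / timeoutSecs)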
Code example #37
0
File: test_storeview_import.py Project: yangls06/h2o
    def test_storeview_import(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values, 
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"
            
            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
                result = h2o.dump_json(storeViewResult)
                f.write(result)
                f.close()
                lastStoreViewResult = storeViewResult
            

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Code example #38
0
    def test_parse_summary_airline_s3(self):
        h2o.beta_features = True
        csvFilelist = [
            ("allyears2k.csv",   300), #4.4MB
            ("year1987.csv",     600), #130MB
            ("allyears.csv",     900), #12GB
            # ("allyears_10.csv", 1800), #119.98GB
        ]

        bucket = 'h2o-airlines-unpacked'
        (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

        print "\nTrying StoreView after the import s3"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            # note this parse uses schema='s3'
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Code example #39
0
    def test_parse_65k_cols_01_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 63000, 'cH', 100),
            (10, 65000, 'cH', 100),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse", parseResult['destination_key'], "took", time.time() - start, "seconds"
            print "Summary should work with 65k"
            start = time.time()
            h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=300)
            print "Summary", parseResult['destination_key'], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount))
Code example #40
0
    def test_parse_summary_airline_s3(self):
        csvFilelist = [
            ("allyears2k.csv",   300), #4.4MB
            ("year1987.csv",     600), #130MB
            ("allyears.csv",     900), #12GB
            # ("allyears_10.csv", 1800), #119.98GB
        ]

        bucket = 'h2o-airlines-unpacked'
        (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

        print "\nTrying StoreView after the import s3"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            # note this parse uses schema='s3'
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Code example #41
0
    def test_parse_summary_zip_s3_fvec(self):
        h2o.beta_features = True
        csvFilelist = [
            ("test_set.zip", 300),  # 110.9MB
            ("train_set.zip", 600),  # 362.9MB
        ]

        (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets',
                                                        path="allstate",
                                                        schema='s3')

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            csvPathname = "allstate/" + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets',
                                           path=csvPathname,
                                           schema='s3',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time(
            ) - trialStart, "seconds."
            trial += 1
Code example #42
0
    def test_parse_65k_cols_01_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 63000, 'cH', 100),
            (10, 65000, 'cH', 100),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse", parseResult['destination_key'], "took", time.time() - start, "seconds"
            print "Summary should work with 65k"
            start = time.time()
            h2o_cmd.runSummary(key=parseResult['destination_key'], timeoutSecs=300)
            print "Summary", parseResult['destination_key'], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount))
Code example #43
0
    def test_parse_summary_manyfiles_1_fvec(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles-nflx-gz",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_1.dat.gz"
            (importResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', timeoutSecs=timeoutSecs)
            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(numCols, 542)
            self.assertEqual(numRows, 100000)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            # pass numRows, so we know when a column's na cnt means the column is all NAs
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360, 
                numCols=numCols, numRows=numRows)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
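
The "pass numRows" comment above is about interpreting per-column NA counts: a column whose NA count equals numRows is entirely missing. In isolation (hypothetical counts):

numRows = 100000
naCnt = [0, 42, 100000]           # per-column NA counts from a summary
all_na_cols = [i for i, n in enumerate(naCnt) if n == numRows]
print all_na_cols                 # -> [2]: that column is all NAs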
Code example #44
0
File: test_libsvm.py Project: Brontai/h2o
    def test_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        for trial in range(2):
            csvFilename = "syn_ints.csv"
            hex_key = "1.hex"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, trial)
            timeoutSecs = 10
        
            # have to import each time, because h2o deletes source after parse

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
            parseResult = h2i.import_parse(parser_type=PARSER_TYPE, path=csvPathname, hex_key=hex_key, timeoutSecs=2000)

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
            print "Inspect:", hex_key, "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o_cmd.infoFromSummary(summaryResult)

            if DO_KMEANS:
                # KMEANS******************************************
                kwargs = {
                    'k': 3, 
                    'initialization': 'Furthest',
                    'ignored_cols': None, #range(11, numCols), # THIS BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise sometimes fails)
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Code example #45
0
    def test_frame_split_balance(self):
        h2o.beta_features = True

        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"

        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, hex_key=hex_key, schema='local', timeoutSecs=20)

        print "Just split away and see if anything blows up"
        splitMe = hex_key
        inspect = h2o_cmd.runInspect(key=splitMe)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        for s in range(20):
            inspect = h2o_cmd.runInspect(key=splitMe)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            fs = h2o.nodes[0].frame_split(source=splitMe, ratios=0.5)
            split0_key = fs['split_keys'][0]
            split1_key = fs['split_keys'][1]
            split0_rows = fs['split_rows'][0]
            split1_rows = fs['split_rows'][1]
            split0_ratio = fs['split_ratios'][0]
            split1_ratio = fs['split_ratios'][1]
            print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
            splitMe = split1_key
            # split should be within 1 row accuracy; assert the two halves differ by less than 2
            self.assertLess(abs(split1_rows - split0_rows), 2)
            self.assertEqual(numRows, (split1_rows + split0_rows))
            self.assertEqual(numCols, origNumCols)
            if split0_rows <= 2:
                break

            print "Now do some rebalancing on the split frames"
            for trial in range(2):
                rb_key = "rb_%s_%s" % (trial, splitMe)
                SEEDPERFILE = random.randint(0, sys.maxint)
                randChunks = random.randint(1, 100)
                start = time.time()
                print "Trial %s: Rebalancing %s to %s with %s chunks" % (trial, splitMe, rb_key, randChunks)
                rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, seed=SEEDPERFILE, chunks=randChunks)
                elapsed = time.time() - start
                print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\
                h2o_cmd.runSummary(key=rb_key)
                print "\nInspecting the original parsed result"
                inspect = h2o_cmd.runInspect(key=hex_key)
                h2o_cmd.infoFromInspect(inspect=inspect)
                print "\nInspecting the rebalanced result with %s forced chunks" % randChunks
                inspect = h2o_cmd.runInspect(key=rb_key)
                h2o_cmd.infoFromInspect(inspect=inspect)
Code example #46
0
File: test_insert_na.py Project: smarthomekit/h2o
    def test_insert_na(self):
        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"

        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       schema='local',
                                       timeoutSecs=20)

        print "Just insert some NAs and see what happens"
        inspect = h2o_cmd.runInspect(key=hex_key)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        missing_fraction = 0.1

        # every iteration, we add 0.1 more from the unmarked to the marked (missing)

        expectedMissing = missing_fraction * origNumRows  # per col
        for trial in range(2):

            fs = h2o.nodes[0].insert_missing_values(
                key=hex_key, missing_fraction=missing_fraction, seed=SEED)
            print "fs", h2o.dump_json(fs)
            inspect = h2o_cmd.runInspect(key=hex_key)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Each column should get .10 random NAs per iteration. Within 10%?
            missingValuesList = h2o_cmd.infoFromInspect(inspect)
            print "missingValuesList", missingValuesList
            for mv in missingValuesList:
                # h2o_util.assertApproxEqual(mv, expectedMissing, tol=0.01, msg='mv %s is not approx. expected %s' % (mv, expectedMissing))
                self.assertAlmostEqual(mv,
                                       expectedMissing,
                                       delta=0.1 * mv,
                                       msg='mv %s is not approx. expected %s' %
                                       (mv, expectedMissing))

            self.assertEqual(origNumRows, numRows)
            self.assertEqual(origNumCols, numCols)

            summaryResult = h2o_cmd.runSummary(key=hex_key)
            # h2o_cmd.infoFromSummary(summaryResult)

            print "trial", trial
            print "expectedMissing:", expectedMissing
            print "I don't understand why the values don't increase every iteration. It seems to stay stuck with the first effect"
Code example #47
0
    def test_parse_summary_manyfiles_s3n(self):
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles-nflx-gz",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs)
            s3nFullList = importHDFSResult['succeeded']
            self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Code example #48
0
    def test_rebalance_int2enum(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100000,  30, 'cC', 100),
            ]

        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=20)
            hex_key=parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=hex_key)
            print "\n" + csvFilename


            print "Rebalancing it to create an artificially large # of chunks"
            rb_key = "rb_%s" % (hex_key)
            start = time.time()
            print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
            rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
            elapsed = time.time() - start
            print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\

            print "Now doing to_enum across all columns of %s" % hex_key
            for column_index in range(colCount):
                # note: the column index is 1-based in to_enum
                result = h2o.nodes[0].to_enum(None, src_key=hex_key, column_index=column_index+1)
                # print "\nto_enum result:", h2o.dump_json(result)
                summaryResult = h2o_cmd.runSummary(key=hex_key)
                # check that it at least is an enum column now, with no na's
                # just look at the column we touched
                column = summaryResult['summaries'][column_index]
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']
                stats = column['stats']
                stattype = stats['type']
                cardinality = stats['cardinality']
                if stattype != 'Enum':
                    raise Exception("column %s, which has name %s, didn't convert to Enum, is %s %s" % (column_index, colname, stattype, coltype))
                if nacnt != 0:
                    raise Exception("column %s, which has name %s, somehow got NAs after convert to Enum %s" % (column_index, colname, nacnt))
                if cardinality != 4:
                    raise Exception("column %s, which has name %s, should have cardinality 4, got: %s" % (column_index, colname, cardinality))
                h2o_cmd.infoFromSummary(summaryResult)
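
The checks above navigate the Summary response shape these snippets rely on: one entry per column under 'summaries', with the type-specific fields under 'stats'. A toy walk over that shape (field names taken from the snippet, values invented):

summaryResult = {'summaries': [
    {'colname': 'C1', 'type': 'Enum', 'nacnt': 0,
     'stats': {'type': 'Enum', 'cardinality': 4}},
]}
column = summaryResult['summaries'][0]
assert column['stats']['type'] == 'Enum' and column['nacnt'] == 0
print column['colname'], column['stats']['cardinality']   # -> C1 4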
Code example #49
0
    def test_parse_summary_manyfiles_s3n(self):
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(bucket='h2o-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs)
            s3nFullList = importHDFSResult['succeeded']
            self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n?")

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets', path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y.. just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Code example #50
0
    def test_NOPASS_exec2_empty_result(self):
        bucket = "smalldata"
        csvPathname = "iris/iris2.csv"
        hexKey = "i.hex"
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema="put", hex_key=hexKey)

        for resultKey, execExpr in initList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

        start = time.time()
        for execExpr in exprList:
            h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)
            rSummary = h2o_cmd.runSummary(key="a")
            h2o_cmd.infoFromSummary(rSummary)

        h2o.check_sandbox_for_errors()
        print "exec end on ", "operators", "took", time.time() - start, "seconds"
Code example #51
0
    def test_parse_mnist_rebalance(self):
        importFolderPath = "mnist"
        csvFilelist = [
            ("mnist_training.csv.gz", 600),
            ("mnist_training.csv.gz", 600),
            ("mnist_testing.csv.gz", 600),
            ("mnist_testing.csv.gz", 600),
        ]

        trial = 0
        allDelta = []
        for (csvFilename, timeoutSecs) in csvFilelist:
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=importFolderPath + "/" +
                                           csvFilename,
                                           hex_key=hex_key,
                                           retryDelaySecs=1,
                                           timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            print "\n#******************************************************************************"
            for trial in range(1):
                rb_key = "rb_%s_%s" % (trial, hex_key)
                SEEDPERFILE = random.randint(0, sys.maxint)
                randChunks = random.randint(1, 100)
                start = time.time()
                print "Trial %s: Rebalancing %s to %s with %s chunks" % (
                    trial, hex_key, rb_key, randChunks)
                rebalanceResult = h2o.nodes[0].rebalance(source=hex_key,
                                                         after=rb_key,
                                                         seed=SEEDPERFILE,
                                                         chunks=randChunks)
                elapsed = time.time() - start
                print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\
                h2o_cmd.runSummary(key=rb_key, timeoutSecs=timeoutSecs)
                print "\nInspecting the original parsed result"
                inspect = h2o_cmd.runInspect(key=hex_key)
                h2o_cmd.infoFromInspect(inspect=inspect)
                print "\nInspecting the rebalanced result with %s forced chunks" % randChunks
                inspect = h2o_cmd.runInspect(key=rb_key)
                h2o_cmd.infoFromInspect(inspect=inspect)
Code example #52
0
    def test_NOPASS_create_frame_fail(self):
        h2o.beta_features = True

        for trial in range(20):
            kwargs = {
                'integer_range': None,
                'missing_fraction': 0.1,
                'cols': 10,
                'response_factors': 1,
                'seed': 1234,
                'randomize': 1,
                'categorical_fraction': 0,
                'rows': 1,
                'factors': 0,
                'real_range': 0,
                'value': None,
                'integer_fraction': 0
            }

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path='poker/poker1000',
                                           hex_key='temp1000.hex',
                                           schema='put',
                                           timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex',
                                                 timeoutSecs=timeoutSecs,
                                                 **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex',
                                          csvPathname=csvPathname,
                                          timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            rSummary = h2o_cmd.runSummary(key='temp1000.hex', cols=10)
            h2o_cmd.infoFromSummary(rSummary)

            print h2o.dump_json(cfResult)

            print "Trial #", trial, "completed"
Code example #53
0
    def test_parse_covtype(self):

        tryList = [
            ('covtype.data', 1, 30),
            ('covtype20x.data', 20, 120),
        ]

        for (csvFilename, multiplyExpected, timeoutSecs) in tryList:
            # h2o-dev doesn't take ../.. type paths? make find_file return an absolute path
            a_node = h2o.nodes[0]

            importFolderPath = os.path.expanduser(
                "~/home-0xdiag-datasets/standard")
            csvPathname = importFolderPath + "/" + csvFilename
            importResult = a_node.import_files(path=csvPathname)

            # print "importResult:", dump_json(importResult)
            hex_key = importResult['destination_frames'][0]

            if CAUSE_FAIL:
                frames_result = a_node.frames(key=k,
                                              row_count=5,
                                              timeoutSecs=timeoutSecs)
            # print "frames_result from the first importResult key", dump_json(frames_result)

            parseResult = a_node.parse(key=hex_key,
                                       timeoutSecs=timeoutSecs,
                                       chunk_size=4194304 * 4)
            pA = h2o_cmd.ParseObj(parseResult)
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=581012 * multiplyExpected,
                                    expectedNumCols=55,
                                    expectedMissinglist=[])
            print iA.missingList, iA.labelList, iA.numRows, iA.numCols

            for i in range(0):
                print "Summary on column", i
                co = h2o_cmd.runSummary(key=hex_key, column=i)

            k = parseResult['frames'][0]['frame_id']['name']
            # print "parseResult:", dump_json(parseResult)
            frames_result = a_node.frames(key=k, row_count=5)
            # print "frames_result from the first parseResult key", dump_json(frames_result)

            parseKeyIndexedCheck(frames_result, multiplyExpected)
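
For reference, the chunk_size passed to parse above is 4 MiB scaled by 4:

chunk_size = 4194304 * 4
print chunk_size, "bytes =", chunk_size / (1024 * 1024), "MiB"   # -> 16777216 bytes = 16 MiB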
Code example #54
0
File: genJson.py Project: zjliang/h2o-2
def getSummaries():
    with open('./smalldata.csv', 'rb') as f:
        for line in f:
            PATHS = []
            NAMES = []
            NUMCOLS = 0
            NUMROWS = 0
            TYPES = []
            RANGES = []
            IGNORED = 'NA'
            TARGET = 'NA'

            DATANAME, uploadPath, importPath, importHDFS, fullPath = line.strip(
                "\n").split(',')
            PATHS = [uploadPath, importPath, importHDFS]

            bucket = 'smalldata'
            path = '/'.join(importPath.split('/')[2:]).strip('"')
            parseResult = h2i.import_parse(bucket=bucket,
                                           path=path,
                                           schema='local',
                                           doSummary=False)
            summary = h2o_cmd.runSummary(key=parseResult['destination_key'])
            columns = summary['summary']['columns']

            NUMCOLS = len(columns)
            NUMROWS = columns[0]['N']
            for col in columns:
                NAMES.append('\"' + col['name'] + '\"')
                TYPES.append('\"' + col['type'] + '\"')
                tup = '(' + '"' + str(min(col['min'])) + '"' + ',' + '"' + str(
                    max(col['max'])
                ) + '"' + ')' if col['type'] == 'number' else '("NA", "NA")'
                RANGES += [tup]

            toJson(DATANAME,
                   PATHS,
                   NAMES,
                   NUMCOLS,
                   NUMROWS,
                   TYPES,
                   RANGES,
                   IGNORED="NA",
                   TARGET="NA")
Code example #55
0
    def test_speedrf_covtype_fvec(self):
        importFolderPath = "standard"

        # Parse Train ******************************************************
        # csvTrainFilename = 'covtype.data'
        csvTrainFilename = 'covtype20x.data'
        csvTrainPathname = importFolderPath + "/" + csvTrainFilename
        hex_key = csvTrainFilename + ".hex"
        parseTrainResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvTrainPathname, hex_key=hex_key,
            timeoutSecs=180, doSummary=False)
        inspect = h2o_cmd.runInspect(None, parseTrainResult['destination_key'])

        xList = []
        eList = []
        fList = []
        trial = 0
        for trial in range(10):
            timeoutSecs = 30
            # have unique model names
            start = time.time()
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print 'summary end', trial, 'on', csvTrainPathname, 'took', elapsed, 'seconds'

            fList.append(elapsed)
            eList.append(elapsed)

            if DO_PLOT:
                xLabel = 'trial'
                xList.append(trial)

        if DO_PLOT:
            eLabel = 'elapsed'
            fLabel = 'elapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
Code example #56
0
        def do_summary_and_inspect():
            # SUMMARY******************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            coltypeList = h2o_cmd.infoFromSummary(summaryResult)

            # INSPECT******************************************
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Now check both inspect and summary
            if csvFilename == 'covtype.binary.svm':
                for k in range(55):
                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0,
                                     naCnt,
                                     msg='col %s naCnt %d should be %s' %
                                     (k, naCnt, 0))
                    stype = inspect['cols'][k]['type']
                    print k, stype
                    self.assertEqual('Int',
                                     stype,
                                     msg='col %s type %s should be %s' %
                                     (k, stype, 'Int'))

                # summary may report type differently than inspect..check it too!
                # we could check na here too
                for i, c in enumerate(coltypeList):
                    print "column index: %s  column type: %s" % (i, c)
                    # inspect says 'Int'?
                    assert c == 'Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (
                        i, c)
Code example #57
File: h2o_kmeans.py Project: zhuyuecai/h2o
def bigCheckResults(self, kmeans, csvPathname, parseResult,
                    applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    if h2o.beta_features:
        # can't use inspect on a model key? now?
        model = kmeans['model']
        model_key = model['_key']
        centers = model['centers']
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        kmeansResult = kmeans
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        h2o.verboseprint('kmeans result:', h2o.dump_json(kmeansResult))
        model = kmeansResult['KMeansModel']
        centers = model['clusters']
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        # no scoring on KMeans2? ...just reuse cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult['destination_key'],
            model_key=model_key,
            destination_key=predictKey)
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult['summaries'][0]['hcnt']  # histogram
        rows_per_cluster = hcnt
        # FIX! does the cluster order/naming match, compared to cluster variances
        sqr_error_per_cluster = cluster_variances

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult['destination_key'],
            model_key=model_key,
            destination_key=applyDestinationKey)
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(
            key=parseResult['destination_key'], model_key=model_key)
        score = kmeansScoreResult['score']
        rows_per_cluster = score['rows_per_cluster']
        sqr_error_per_cluster = score['sqr_error_per_cluster']

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", [round(v, 2) for v in centers[i]]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append(
            (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
Code example #58
    def test_c5_KMeans_sphere_67MB_fvec(self):
        h2o.beta_features = True
        # a kludge
        h2o.setup_benchmark_log()

        csvFilename = 'syn_sphere_gen_h1m_no_na.csv'
        totalBytes = 67306997
        if FROM_HDFS:
            importFolderPath = "datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename
        else:
            importFolderPath = "/home3/0xdiag/datasets/kmeans_big"
            csvPathname = importFolderPath + '/' + csvFilename

        # FIX! put right values in
        # will there be different expected for random vs the other inits?
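        # each expected tuple: (center coordinates, rows in that cluster, within-cluster squared error)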
        expected = [
            ([0.0, -113.00566692375459, -89.99595447985321, -455.9970643424373, 4732.0, 49791778.0, 36800.0], 248846122, 1308149283316.2988) ,
            ([0.0, 1.0, 1.0, -525.0093818313685, 2015.001629398412, 25654042.00592703, 28304.0], 276924291, 1800760152555.98) ,
            ([0.0, 5.0, 2.0, 340.0, 1817.995920197288, 33970406.992053084, 31319.99486705394], 235089554, 375419158808.3253) ,
            ([0.0, 10.0, -72.00113070337981, -171.0198611715457, 4430.00952228909, 37007399.0, 29894.0], 166180630, 525423632323.6474) ,
            ([0.0, 11.0, 3.0, 578.0043558141306, 1483.0163188052604, 22865824.99639042, 5335.0], 167234179, 1845362026223.1094) ,
            ([0.0, 12.0, 3.0, 168.0, -4066.995950679284, 41077063.00269915, -47537.998050740985], 195420925, 197941282992.43475) ,
            ([0.0, 19.00092954923767, -10.999565572612255, 90.00028669073289, 1928.0, 39967190.0, 27202.0], 214401768, 11868360232.658035) ,
            ([0.0, 20.0, 0.0, 141.0, -3263.0030236302937, 6163210.990273981, 30712.99115201907], 258853406, 598863991074.3276) ,
            ([0.0, 21.0, 114.01584574295777, 242.99690338815898, 1674.0029079209912, 33089556.0, 36415.0], 190979054, 1505088759456.314) ,
            ([0.0, 25.0, 1.0, 614.0032787274755, -2275.9931284021022, -48473733.04122273, 47343.0], 87794427, 1124697008162.3955) ,
            ([0.0, 39.0, 3.0, 470.0, -3337.9880599007597, 28768057.98852736, 16716.003410920028], 78226988, 1151439441529.0215) ,
            ([0.0, 40.0, 1.0, 145.0, 950.9990795199593, 14602680.991458317, -14930.007919032574], 167273589, 693036940951.0249) ,
            ([0.0, 42.0, 4.0, 479.0, -3678.0033024834297, 8209673.001421165, 11767.998552236539], 148426180, 35942838893.32379) ,
            ([0.0, 48.0, 4.0, 71.0, -951.0035145455234, 49882273.00063991, -23336.998167498707], 157533313, 88431531357.62982) ,
            ([0.0, 147.00394564757505, 122.98729664236723, 311.0047920137008, 2320.0, 46602185.0, 11212.0], 118361306, 1111537045743.7646) ,
        ]

        # earlier logging variants, kept for reference; IOStatus can hang?
        # benchmarkLogging = ['cpu', 'disk', 'network', 'iostats', 'jstack']
        # benchmarkLogging = ['cpu', 'disk', 'network', 'iostats']
        # benchmarkLogging = ['cpu', 'disk', 'network']
        benchmarkLogging = []

        for trial in range(6):
            # IMPORT**********************************************
            # since H2O deletes the source key, re-import every iteration.
            # PARSE ****************************************
            print "Parse starting: " + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            timeoutSecs = 2 * 3600
            kwargs = {}
            if FROM_HDFS:
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            else:
                parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, pollTimeoutSecs=60, retryDelaySecs=2,
                    benchmarkLogging=benchmarkLogging, **kwargs)

            elapsed = time.time() - start
            fileMBS = (totalBytes/1e6)/elapsed
            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:6.2f} MB/sec for {:.2f} secs'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'Parse', csvPathname, fileMBS, elapsed)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # clear out all NAs (walk across cols)..clear to 0
            # temp
            ## execExpr = '%s=apply(%s,2,function(x){ifelse(is.na(x),0,x)})' % (hex_key, hex_key)
            ## h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=None, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=500)
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            summary = h2o_cmd.runSummary(key=hex_key, timeoutSecs=500)
            h2o_cmd.infoFromSummary(summary)

            # KMeans ****************************************
            if not DO_KMEANS:
                continue

            print "col 0 is enum in " + csvFilename + " but KMeans should skip that automatically?? or no?"
            kwargs = {
                'k': 15, 
                'max_iter': 10,
                'normalize': 1,
                'initialization': 'Furthest',
                'destination_key': 'junk.hex', 
                # reuse the same seed, to get deterministic results
                'seed': 265211114317615310,
                # 'ignored_cols': 'C0', # get NaNs if col with all NAs is left in. the exec2 clear doesn't seem to work
                }

            if (trial%3)==0:
                kwargs['initialization'] = 'PlusPlus'
            elif (trial%3)==1:
                kwargs['initialization'] = 'Furthest'
            else:
                kwargs['initialization'] = None

            timeoutSecs = 4 * 3600
            params = kwargs
            paramsString = json.dumps(params)

            start = time.time()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs,
                    benchmarkLogging=benchmarkLogging, **kwargs)
            elapsed = time.time() - start
            print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            print "kmeans result:", h2o.dump_json(kmeans)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:s} for {:.2f} secs {:s}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, "KMeans", "trial "+str(trial), csvFilename, elapsed, paramsString)
            print l
            h2o.cloudPerfH2O.message(l)

            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
            # all are multipliers of expected tuple value
            allowedDelta = (0.01, 0.01, 0.01) 
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, allowError=True, trial=trial)
            h2i.delete_keys_at_all_nodes()
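
All of the examples above share the same parse / inspect / summary skeleton. As a closing reference, here is a minimal distilled sketch of that pattern; it assumes the h2o test-harness modules (h2i, h2o_cmd) are importable and a cloud is already built, and the helper name parse_and_summarize is made up for illustration.

import h2i
import h2o_cmd

def parse_and_summarize(bucket, path, hex_key, timeoutSecs=300):
    # parse the dataset into an H2O frame stored under hex_key
    parseResult = h2i.import_parse(bucket=bucket, path=path,
                                   hex_key=hex_key, timeoutSecs=timeoutSecs)
    # Inspect reports row/column counts and per-column NA counts
    inspect = h2o_cmd.runInspect(key=hex_key)
    h2o_cmd.infoFromInspect(inspect, path)
    # Summary adds per-column histograms and basic statistics
    rSummary = h2o_cmd.runSummary(key=hex_key,
                                  rows=inspect['numRows'],
                                  cols=inspect['numCols'])
    h2o_cmd.infoFromSummary(rSummary)
    return parseResult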