def test_parse_1m_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [(10, 65000, "cH", 30)]

        h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            print "Summary should work with 65k"
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True
            )
            print csvFilename, "parse time:", parseResult["response"]["time"]
            print "Parse and summary:", parseResult["destination_key"], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, "    num_rows:", "{:,}".format(
                inspect["num_rows"]
            ), "    num_cols:", "{:,}".format(inspect["num_cols"])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect["num_cols"],
                colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect["num_cols"], colCount),
            )
            self.assertEqual(
                inspect["num_rows"],
                rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s"
                % (inspect["num_rows"], rowCount),
            )

            # we should obey max_column_display
            column_limits = [25, 25000, 50000]
            for column_limit in column_limits:
                inspect = h2o_cmd.runInspect(
                    None, parseResult["destination_key"], max_column_display=column_limit, timeoutSecs=timeoutSecs
                )
                self.assertEqual(
                    len(inspect["cols"]), column_limit, "inspect should obey max_column_display = " + str(column_limit)
                )
                for r in range(0, len(inspect["rows"])):
                    # NB: +1 below because each row starts with a row-label cell: #{row}
                    self.assertEqual(
                        len(inspect["rows"][r]),
                        column_limit + 1,
                        "inspect data rows should obey max_column_display = " + str(column_limit),
                    )
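# write_syn_dataset is called above but defined elsewhere in the test file.
# A minimal sketch of such a helper, assuming a plain comma-separated file of
# small random ints; the real generator's value distribution may differ.
def write_syn_dataset_sketch(csvPathname, rowCount, colCount, SEED):
    r = random.Random(SEED)  # per-file seed, so reruns produce distinct files
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        rowData = [str(r.randint(0, 9)) for j in range(colCount)]
        dsf.write(",".join(rowData) + "\n")
    dsf.close()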
Example #2
    def test_from_import_fvec(self):
        csvFilenameAll = [
            ("covtype.data", 500),
            # ("covtype20x.data", 1000),
            ]

        for (csvFilename, timeoutSecs) in csvFilenameAll:
            # creates csvFilename.hex from file in importFolder dir 
            hex_key = csvFilename + '.hex'
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path="standard/" + csvFilename, schema='local',
                hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse result['destination_key']:", parseResult['destination_key']

            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], verbose=True)
            h2o_cmd.infoFromInspect(inspect, parseResult['destination_key'])

            summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
            # h2o_cmd.infoFromSummary(summaryResult)

            trees = 2
            start = time.time()
            rfView = h2o_cmd.runRF(trees=trees, max_depth=20, balance_classes=0, importance=1, parseResult=parseResult, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start

            (classification_error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView, ntree=trees)

            l = '{!s} jvms, {!s}GB heap, {:s} {:s} {:.2f} secs. trees: {} classification_error: {} classErrorPct: {} totalScores: {}'.format(
                len(h2o.nodes), h2o.nodes[0].java_heap_GB, 'DRF2', csvFilename, elapsed,
                trees, classification_error, classErrorPctList, totalScores)
            print "\n"+l
            h2o.cloudPerfH2O.message(l)

            # just to make sure we test this
            h2i.delete_keys_at_all_nodes(pattern=hex_key)
    def parseFile(self,
                  importFolderPath='datasets',
                  csvFilename='airlines_all.csv',
                  timeoutSecs=500,
                  **kwargs):
        csvPathname = importFolderPath + "/" + csvFilename

        start = time.time()
        parseResult = h2i.import_parse(path=csvPathname,
                                       schema='hdfs',
                                       timeoutSecs=timeoutSecs)
        elapsed = time.time() - start
        print "Parse of", parseResult[
            'destination_key'], "took", elapsed, "seconds"
        parseResult['python_call_timer'] = elapsed
        print "Parse result['destination_key']:", parseResult[
            'destination_key']

        start = time.time()
        inspect = h2o_cmd.runInspect(None,
                                     parseResult['destination_key'],
                                     timeoutSecs=200)
        elapsed = time.time() - start
        print "Inspect:", parseResult[
            'destination_key'], "took", elapsed, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']
        print "numRows:", numRows, "numCols", numCols
        return parseResult
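# A hedged usage sketch for parseFile above. The 'python_call_timer' key is
# attached by parseFile itself; testCase is assumed to be an instance of the
# test class that defines parseFile.
def time_airlines_parse(testCase):
    parseResult = testCase.parseFile(csvFilename='airlines_all.csv',
                                     timeoutSecs=1200)
    print "import_parse wall time:", parseResult['python_call_timer'], "seconds"
    return parseResult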
    def test_many_cols_and_types(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5, 'cA', 5),
            (1000, 59, 'cB', 5),
            (5000, 128, 'cC', 5),
            (6000, 507, 'cD', 5),
            (9000, 663, 'cE', 5),
            ]
        
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            print "\n" + csvFilename
    def test_rf_allyears2k_oobe(self):
        importFolderPath = '/home/0xdiag/datasets'
        csvFilename = 'allyears2k.csv'
        csvPathname = importFolderPath + "/" + csvFilename
        h2i.setupImportFolder(None, importFolderPath)
        parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, timeoutSecs=60)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        for trial in range(10):
            kwargs = paramDict
            timeoutSecs = 30 + kwargs['ntree'] * 2

            start = time.time()
            # randomize the node
            node = h2o.nodes[random.randint(0,len(h2o.nodes)-1)]
            rfView = h2o_cmd.runRFOnly(node=node, parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "RF end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

            classification_error = rfView['confusion_matrix']['classification_error']
            rows_skipped = rfView['confusion_matrix']['rows_skipped']
            mtry = rfView['mtry']
            mtry_nodes = rfView['mtry_nodes']
            print "mtry:", mtry
            print "mtry_nodes:", mtry_nodes
            self.assertEqual(classification_error, 0, "Should have zero oobe error")
            self.assertEqual(rows_skipped, 39, "Should have exactly 39 rows skipped")

            print "Trial #", trial, "completed"
    def test_KMeans_covtype_cols_fvec(self):
        h2o.beta_features = True
        # just do the import folder once
        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("covtype.binary.svm", "cC", 30, 1),
            # normal csv
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        importFolderPath = "libsvm"
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            csvPathname = importFolderPath + "/" + csvFilename

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir 
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, 
                hex_key=hex_key, timeoutSecs=2000)
            print "Parse result['destination_key']:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # KMEANS******************************************
            for trial in range(1):
                kwargs = {
                    'k': 3, 
                    'initialization': 'Furthest',
                    'ignored_cols': range(11, numCols),
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                for trial2 in range(3):
                    timeoutSecs = 600
                    start = time.time()
                    kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
                    elapsed = time.time() - start
                    print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                        "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                    # this does an inspect of the model and prints the clusters
                    h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                    (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvPathname, parseResult, 'd', **kwargs)
Exemple #7
0
def bigCheckResults(self, kmeans, csvPathname, parseKey, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    model_key = kmeans['destination_key']
    kmeansResult = h2o_cmd.runInspect(key=model_key)
    centers = kmeansResult['KMeansModel']['clusters']

    kmeansApplyResult = h2o.nodes[0].kmeans_apply(
        data_key=parseKey['destination_key'], model_key=model_key,
        destination_key=applyDestinationKey)
    inspect = h2o_cmd.runInspect(None, applyDestinationKey)
    h2o_cmd.infoFromInspect(inspect, csvPathname)

    kmeansScoreResult = h2o.nodes[0].kmeans_score(
        key=parseKey['destination_key'], model_key=model_key)
    score = kmeansScoreResult['score']
    rows_per_cluster = score['rows_per_cluster']
    sqr_error_per_cluster = score['sqr_error_per_cluster']

    tupleResultList = []
    for i,c in enumerate(centers):
        print "\ncenters["+str(i)+"]: ", centers[i]
        print "rows_per_cluster["+str(i)+"]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster["+str(i)+"]: ", sqr_error_per_cluster[i]
        tupleResultList.append( (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]) )

    return (centers, tupleResultList)
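# The (centers, tupleResultList) return pairs each cluster center with its row
# count and squared error. A small pure-Python sketch of how a caller might
# rank clusters by error contribution (no extra h2o calls assumed):
def rank_clusters_by_error(tupleResultList):
    # each tuple is (center, rows_per_cluster, sqr_error_per_cluster)
    ranked = sorted(tupleResultList, key=lambda t: t[2], reverse=True)
    for center, rows, sqrError in ranked:
        print "rows: %6d  sqr_error: %12.2f  center: %s" % (rows, sqrError, center)
    return ranked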
Example #8
    def test_storeview_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            importResult = h2i.import_only(bucket='home-0xdiag-datasets', path="*", timeoutSecs=timeoutSecs)
            print h2o.dump_json(importResult)
            storeViewResult = h2o_cmd.runStoreView(timeoutSecs=30)
            # print h2o.dump_json(storeViewResult)

            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local',
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values, 
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y; just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"
            
            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                f = open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w")
                f.write(h2o.dump_json(storeViewResult))
                f.close()
                lastStoreViewResult = storeViewResult
            

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
    def import_frame(self, target_key, bucket, csvFilename, csvPathname,
                     expected_rows, expected_cols):
        path = csvPathname + '/' + csvFilename
        parseResult = h2i.import_parse(bucket=bucket,
                                       path=path,
                                       hex_key=target_key,
                                       schema='put')  # upload the file
        destination_key = parseResult[
            'destination_key']  # we block until it's actually ready

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        actual_rows = inspect['numRows']
        actual_cols = inspect['numCols']

        print 'loaded frame "' + target_key + '" from path: ' + path
        print 'rows: ', actual_rows
        print 'cols: ', actual_cols

        # Don't have access to the testCase assert methods here because they aren't class methods. :-(
        assert expected_rows == actual_rows, "Expected " + str(
            expected_rows) + " but got " + str(
                actual_rows) + " for path: " + path
        assert expected_cols == actual_cols, "Expected " + str(
            expected_cols) + " but got " + str(
                actual_cols) + " for path: " + path

        # TODO: other info we could check
        # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        #     h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)
        #
        # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        # h2o_cmd.infoFromSummary(summaryResult) # , noPrint=True
        return destination_key
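# A hedged usage sketch for import_frame above. covtype.data is 581012 rows by
# 55 cols (54 features plus the class column); the bucket and folder follow the
# other tests here but are assumptions in this sketch.
def load_covtype_sketch(testCase):
    return testCase.import_frame('covtype.hex', 'home-0xdiag-datasets',
                                 'covtype.data', 'standard', 581012, 55)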
    def test_parse_200k_cols_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 100000, 'cA', 200, 200),
            # (10, 200000, 'cB', 200, 200),
            # (10, 300000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
        ]

        h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           doSummary=False)
            print "Parse:", parseResult['destination_key'], "took", time.time(
            ) - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount))

            # if not h2o.browse_disable:
            #    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
            #    time.sleep(5)
            h2i.delete_keys_at_all_nodes()
Example #11
def bigCheckResults(self, kmeans, csvPathname, parseKey, applyDestinationKey,
                    **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    model_key = kmeans['destination_key']
    kmeansResult = h2o_cmd.runInspect(key=model_key)
    centers = kmeansResult['KMeansModel']['clusters']

    kmeansApplyResult = h2o.nodes[0].kmeans_apply(
        data_key=parseKey['destination_key'],
        model_key=model_key,
        destination_key=applyDestinationKey)
    inspect = h2o_cmd.runInspect(None, applyDestinationKey)
    h2o_cmd.infoFromInspect(inspect, csvPathname)

    kmeansScoreResult = h2o.nodes[0].kmeans_score(
        key=parseKey['destination_key'], model_key=model_key)
    score = kmeansScoreResult['score']
    rows_per_cluster = score['rows_per_cluster']
    sqr_error_per_cluster = score['sqr_error_per_cluster']

    tupleResultList = []
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", centers[i]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(
            i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append(
            (centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
def createTestTrain(srcKey, trainDstKey, testDstKey, percent, outputClass, numCols):
    # will have to live with random extract. will create variance
    print "train: get random %. change class 4 to 1, everything else to 0. factor() to turn real to int (for rf)"


    # Create complexity for no good reason! Do the same thing STUPID_REPEAT times in a single exec expression
    execExpr = ""
    STUPID_REPEAT = 20
    for i in range(STUPID_REPEAT):
        execExpr += "a.hex=runif(%s);" % srcKey
        execExpr += "%s=%s[a.hex%s,];" % (trainDstKey, srcKey, '<=0.9')
        if not DO_MULTINOMIAL:
            execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, numCols, trainDstKey, numCols, outputClass)
            execExpr +=  "factor(%s[, %s]);" % (trainDstKey, numCols)

    h2o_exec.exec_expr(None, execExpr, resultKey=trainDstKey, timeoutSecs=STUPID_REPEAT * 15)

    inspect = h2o_cmd.runInspect(key=trainDstKey)
    h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (trainDstKey, srcKey) )

    print "test: same, but use the same runif() random result, complement"

    execExpr = "a.hex=runif(%s);" % srcKey
    execExpr += "%s=%s[a.hex%s,];" % (testDstKey, srcKey, '>0.9')
    if not DO_MULTINOMIAL:
        execExpr += "%s[,%s]=%s[,%s]==%s;" % (testDstKey, numCols, testDstKey, numCols, outputClass)
        execExpr +=  "factor(%s[, %s])" % (testDstKey, numCols)
    h2o_exec.exec_expr(None, execExpr, resultKey=testDstKey, timeoutSecs=10)

    inspect = h2o_cmd.runInspect(key=testDstKey)
    h2o_cmd.infoFromInspect(inspect, "%s after mungeDataset on %s" % (testDstKey, srcKey) )
    def test_one_hot_expand_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 1100, 'cA', 5),
            (100, 1000, 'cB', 5),
            (100, 900, 'cC', 5),
            (100, 800, 'cD', 5),
            (100, 700, 'cE', 5),
            (100, 600, 'cF', 5),
            (100, 500, 'cG', 5),
            ]

        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        cnum = 0
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            cnum += 1
            csvFilename = 'syn_' + str(SEED) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEED)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=10)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # does it modify the original or ?
            oneHotResult = h2o.nodes[0].one_hot(source=parseResult['destination_key'])

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
Example #14
    def test_c7_rel(self):
        h2o.beta_features = False
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        csvFilename = 'part-00000b'
        importFolderPath = '/mnt/0xcustomer-datasets/c2'
        csvPathname = importFolderPath + "/" + csvFilename

        # FIX! does 'separator=' take ints or ?? hex format
        # looks like it takes the hex string (two chars)
        start = time.time()
        # hardwire TAB (ascii 9) as the separator, as opposed to generic white space
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=True)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        print "Parse result['destination_key']:", parseResult['destination_key']

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        # num_rows = inspect['num_rows']
        # num_cols = inspect['num_cols']

        keepPattern = "oly_|mt_|b_"
        y = "is_purchase"
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
        print "x:", x

        kwargs = {
            'x': x,
            'y': y,
            # 'case_mode': '>',
            # 'case': 0,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 4,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600

        if DO_GLM:
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "glm completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

        # do summary of the parsed dataset last, since we know it fails on this dataset
        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)
    def test_parse_summary_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        timeoutSecs = 300

        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        parseResult  = h2i.import_parse(path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList))
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)

        csvPathname_test  = importFolderPath + '/persona_clean_deep.tsv.zip'
        validation_key = 'test.hex'
        parseResult = h2i.import_parse(path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList))

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
Example #16
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, 
    csvSrcOutputPathname, csvPredictPathname, 
    skipSrcOutputHeader, skipPredictHeader,
    translate=None, y=0):
    # have to slice out col 0 (the output) and feed result to predict
    # cols are 0:784 (1 output plus 784 input features)
    # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
    dataKey = "P.hex"
    h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting
    if skipSrcOutputHeader:
        print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
        print "hack for now, can't chop out col 0 in Exec currently"
        dataKey = hex_key
    else:
        print "No header in dataset, can't chop out cols, since col numbers are used for names"
        dataKey = hex_key

    # +1 col index because R-like
    h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

    start = time.time()
    predict = h2o.nodes[0].generate_predictions(model_key=model_key,
        data_key=hex_key, destination_key=predictHexKey)
    print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
    h2o.check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(key=predictHexKey)
    h2o_cmd.infoFromInspect(inspect, 'predict.hex')

    h2o.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
    h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
    h2o.check_sandbox_for_errors()

    print "Do a check of the original output col against predicted output"
    (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
        msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
    (rowNum2, predictOutput)  = compare_csv_at_one_col(csvPredictPathname,
        msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

    # no header on source
    if (rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader):
        raise Exception("original rowNum1: %s - %d not same as downloaded predict rowNum2: %s - %d" %
            (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

    wrong = 0
    for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
        # if float(o)!=float(p):
        if str(o)!=str(p):
            if wrong==10:
                print "Not printing any more mismatches\n"
            elif wrong<10:
                msg = "Comparing original output col vs predicted. row %s differs. \
                    original: %s predicted: %s"  % (rowNum, o, p)
                print msg
            wrong += 1

    print "\nTotal wrong:", wrong
    print "Total:", len(originalOutput)
    pctWrong = (100.0 * wrong)/len(originalOutput)
    print "wrong/Total * 100 ", pctWrong
    return pctWrong
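# compare_csv_at_one_col is used above but defined elsewhere. A minimal sketch
# of the shape it appears to have: return the number of rows read plus the
# values of one column. The body (and the dict-style translate) are assumptions.
import csv

def compare_csv_at_one_col_sketch(csvPathname, msg=None, colIndex=0,
                                  translate=None, skipHeader=0):
    values = []
    rowNum = -1
    f = open(csvPathname, "rb")
    for rowNum, row in enumerate(csv.reader(f)):
        if rowNum < skipHeader:
            continue  # skip header rows; the caller subtracts them from the count
        v = row[colIndex]
        if translate:
            v = translate.get(v, v)  # optional remapping, e.g. labels to ints
        values.append(v)
    f.close()
    return (rowNum + 1, values)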
    def test_many_cols_and_types(self):
        SEED = random.randint(0, sys.maxint)
        print "\nUsing random seed:", SEED
        # SEED =
        random.seed(SEED)
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5, "cA", 5),
            (1000, 59, "cB", 5),
            (5000, 128, "cC", 5),
            (6000, 507, "cD", 5),
            (9000, 663, "cE", 5),
        ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=30)
            print csvFilename, "parse time:", parseKey["response"]["time"]
            print "Parse result['destination_key']:", parseKey["destination_key"]
            inspect = h2o_cmd.runInspect(None, parseKey["destination_key"])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            print "\n" + csvFilename
Example #18
    def test_big_sum_fail(self):
        node = h2o.nodes[0]
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvPathname = SYNDATASETS_DIR + '/temp.csv'
        hex_key = 'temp.hex'
        for trial in range(5):
            # what about seed?
            cfResult = h2o.nodes[0].create_frame(key=hex_key,
                binary_ones_fraction=0.02, binary_fraction=0, randomize=1, 
                missing_fraction=0, integer_fraction=1, real_range=100,
                has_response=0, response_factors=2, factors=100, cols=1, 
                integer_range=100, value=0, categorical_fraction=0, rows=2.5e+08, 
                timeoutSecs=300)

            inspect = h2o_cmd.runInspect(key=hex_key)
            h2o_cmd.infoFromInspect(inspect, hex_key)

            if UNNECESSARY:
                # this is just doing a head to R. not critical
                h2e.exec_expr(execExpr="%s = %s" % (hex_key, hex_key))
                h2e.exec_expr(execExpr="Last.value.0 = %s[c(1,2,3,4,5,6),]" % hex_key)
                h2e.exec_expr(execExpr="Last.value.0 = Last.value.0")
                node.csv_download(src_key="Last.value.0", csvPathname=csvPathname)
                node.remove_key("Last.value.0")
                # not sure why this happened
                h2o_cmd.runStoreView(view=10000, offset=0)


            # Fails on this
            h2e.exec_expr(execExpr='Last.value.1 = %s[,1]' % hex_key)

            print "Trial #", trial, "completed"
Example #19
def predict_and_compare_csvs(model_key, hex_key, predictHexKey, 
    csvSrcOutputPathname, csvPredictPathname, 
    skipSrcOutputHeader, skipPredictHeader,
    translate=None, y=0):
    # have to slice out col 0 (the output) and feed result to predict
    # cols are 0:784 (1 output plus 784 input features)
    # h2e.exec_expr(execExpr="P.hex="+hex_key+"[1:784]", timeoutSecs=30)
    dataKey = "P.hex"
    h2e.exec_expr(execExpr=dataKey+"="+hex_key, timeoutSecs=30) # unneeded but interesting
    if skipSrcOutputHeader:
        print "Has header in dataset, so should be able to chop out col 0 for predict and get right answer"
        print "hack for now, can't chop out col 0 in Exec currently"
        dataKey = hex_key
    else:
        print "No header in dataset, can't chop out cols, since col numbers are used for names"
        dataKey = hex_key

    # +1 col index because R-like
    h2e.exec_expr(execExpr="Z.hex="+hex_key+"[,"+str(y+1)+"]", timeoutSecs=30)

    start = time.time()
    predict = h2o_nodes.nodes[0].generate_predictions(model_key=model_key,
        data_key=hex_key, destination_key=predictHexKey)
    print "generate_predictions end on ", hex_key, " took", time.time() - start, 'seconds'
    check_sandbox_for_errors()
    inspect = h2o_cmd.runInspect(key=predictHexKey)
    h2o_cmd.infoFromInspect(inspect, 'predict.hex')

    h2o_nodes.nodes[0].csv_download(src_key="Z.hex", csvPathname=csvSrcOutputPathname)
    h2o_nodes.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
    check_sandbox_for_errors()

    print "Do a check of the original output col against predicted output"
    (rowNum1, originalOutput) = compare_csv_at_one_col(csvSrcOutputPathname,
        msg="Original", colIndex=0, translate=translate, skipHeader=skipSrcOutputHeader)
    (rowNum2, predictOutput)  = compare_csv_at_one_col(csvPredictPathname,
        msg="Predicted", colIndex=0, skipHeader=skipPredictHeader)

    # no header on source
    if (rowNum1 - skipSrcOutputHeader) != (rowNum2 - skipPredictHeader):
        raise Exception("original rowNum1: %s - %d not same as downloaded predict rowNum2: %s - %d" %
            (rowNum1, skipSrcOutputHeader, rowNum2, skipPredictHeader))

    wrong = 0
    for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
        # if float(o)!=float(p):
        if str(o)!=str(p):
            if wrong==10:
                print "Not printing any more mismatches\n"
            elif wrong<10:
                msg = "Comparing original output col vs predicted. row %s differs. \
                    original: %s predicted: %s"  % (rowNum, o, p)
                print msg
            wrong += 1

    print "\nTotal wrong:", wrong
    print "Total:", len(originalOutput)
    pctWrong = (100.0 * wrong)/len(originalOutput)
    print "wrong/Total * 100 ", pctWrong
    return pctWrong
Example #20
    def import_frame(self, target_key, bucket, csvFilename, csvPathname, expected_rows, expected_cols):
        path = csvPathname + '/' + csvFilename
        parseResult = h2i.import_parse(bucket=bucket, path=path, hex_key=target_key, schema='put') # upload the file
        destination_key = parseResult['destination_key']  # we block until it's actually ready

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        actual_rows = inspect['numRows']
        actual_cols = inspect['numCols']

        print 'loaded frame "' + target_key +'" from path: ' + path
        print 'rows: ', actual_rows
        print 'cols: ', actual_cols

        # Don't have access to the testCase assert methods here because they aren't class methods. :-(
        assert expected_rows == actual_rows, "Expected " + str(expected_rows) + " but got " + str(actual_rows) + " for path: " + path
        assert expected_cols == actual_cols, "Expected " + str(expected_cols) + " but got " + str(actual_cols) + " for path: " + path

        # TODO: other info we could check
        # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        #     h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=True)
        # 
        # summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'])
        # h2o_cmd.infoFromSummary(summaryResult) # , noPrint=True
        return destination_key
        def do_summary_and_inspect():
            # SUMMARY******************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            coltypeList = h2o_cmd.infoFromSummary(summaryResult)

            # INSPECT******************************************
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            numRows = inspect['numRows']
            numCols = inspect['numCols']

            # Now check both inspect and summary
            if csvFilename=='covtype.binary.svm':
                for k in range(55):
                    naCnt = inspect['cols'][k]['naCnt']
                    self.assertEqual(0, naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, 0))
                    stype = inspect['cols'][k]['type']
                    print k, stype
                    self.assertEqual('Int', stype, msg='col %s type %s should be %s' % (k, stype, 'Int'))

                # summary may report type differently than inspect..check it too!
                # we could check na here too
                for i,c in enumerate(coltypeList):
                    print "column index: %s  column type: %s" % (i, c)
                    # inspect says 'Int'?
                    assert c=='Numeric', "All cols in covtype.binary.svm should be parsed as Numeric! %s %s" % (i,c)
Example #22
    def test_randomFilter(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        # use SEED so the file isn't cached?
        csvFilenameAll = [
            ('syn_1mx8_' + str(SEED) + '.csv', 'cA', 5),
            ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        csvFilenameList = csvFilenameAll
        ### h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)
        for (csvFilename, key2, timeoutSecs) in csvFilenameList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            print "Creating random 1mx8 csv"
            write_syn_dataset(csvPathname, 1000000, SEEDPERFILE)
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2o_cmd.parseFile(None, csvPathname, key2=key2, timeoutSecs=2000)

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            print "\n" + csvFilename
            h2e.exec_zero_list(zeroList)
            # does n+1 so use maxCol 6
            h2e.exec_expr_list_rand(lenNodes, exprList, key2, 
                maxCol=6, maxRow=400000, maxTrials=200, timeoutSecs=timeoutSecs)
    def test_rf_big_rand_tree_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowCount = 5000
        colCount = 1000
        write_syn_dataset(csvPathname, rowCount, colCount)

        for trial in range (1):
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = csvFilename + "_" + str(trial)
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            seed = random.randint(0,sys.maxint)
            # some cols can be dropped due to constant 0 or 1. make sure data set has all 0's and all 1's above
            # to guarantee no dropped cols!
            # kwargs = {'ntree': 3, 'depth': 50, 'seed': seed}
            # out of memory/GC errors with the above. reduce depth
            kwargs = {'ntrees': 3, 'max_depth': 20, 'seed': seed}
            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=90)
            h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=600, pollTimeoutSecs=180, **kwargs)
            print "trial #", trial, "rowCount:", rowCount, "colCount:", colCount, "RF end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            inspect = h2o_cmd.runInspect(key=hex_key)
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            cols = inspect['cols']
            numCols = inspect['numCols']
            for i,c in enumerate(cols):
                colType = c['type']
                self.assertEqual(colType, 'Int', msg="col %d type is %s; expected Int" % (i, colType))

            h2o.check_sandbox_for_errors()
    def test_many_cols_and_types(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 5, 'cA', 5),
            (1000, 59, 'cB', 5),
            (5000, 128, 'cC', 5),
            (6000, 507, 'cD', 5),
            (9000, 663, 'cE', 5),
        ]

        for (rowCount, colCount, key2, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            csvFilename = "syn_%s_%s_%s.csv" % (SEEDPERFILE, rowCount,
                                                colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=30)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey[
                'destination_key']
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            print "\n" + csvFilename
    def test_parse_summary_c21(self):
        importFolderPath = '/mnt/0xcustomer-datasets/c21'
        timeoutSecs = 300

        csvPathname_train = importFolderPath + '/persona_clean_deep.tsv.zip'
        hex_key = 'train.hex'
        parseResult = h2i.import_parse(path=csvPathname_train,
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_train)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_train, missingValuesList))
        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key)
        h2o_cmd.infoFromSummary(rSummary, rows=numRows, cols=numCols)

        csvPathname_test = importFolderPath + '/persona_clean_deep.tsv.zip'
        validation_key = 'test.hex'
        parseResult = h2i.import_parse(path=csvPathname_test,
                                       hex_key=validation_key,
                                       timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(key=hex_key)
        missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathname_test)
        # self.assertEqual(missingValuesList, [], "%s should have 3 cols of NAs: %s" % (csvPathname_test, missingValuesList))

        numCols = inspect['numCols']
        numRows = inspect['numRows']
        rSummary = h2o_cmd.runSummary(key=hex_key, rows=numRows, cols=numCols)
        h2o_cmd.infoFromSummary(rSummary)
    def test_plot_remove_keys_manyfiles(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()

        print "Remember, the parse only deletes what got parsed. We import the folder. So we double import. That should work now"
        tryList = [
            ("file_1[0-9].dat.gz", 'c10', 600),
            ("file_[1-2][0-9].dat.gz", 'c20', 600),
            ("file_[1-4][0-9].dat.gz", 'c40', 600),
            ("file_[1-8][0-9].dat.gz", 'c80', 600),
            # don't do this case. times out at 300 sec on polling with 172-180
            # ("file_[1-2][1-8][0-9].dat.gz", 'c160', 1200),
        ]
        
        xList = []
        eList = []
        fList = []
        importFolderPath = "manyfiles-nflx-gz"
        for (csvFilePattern, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvPathname = importFolderPath + "/" + csvFilePattern
            start = time.time()
            parseResult = h2i.import_parse(bucket="home-0xdiag-datasets", path=csvPathname, hex_key=hex_key, 
                retryDelaySecs=3, timeoutSecs=timeoutSecs, doSummary=False)
            parseElapsed = time.time() - start
            print "Parse only:", parseResult['destination_key'], "took", parseElapsed, "seconds"
            h2o.check_sandbox_for_errors()

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            parsedBytes = inspect['byteSize']

            node = h2o.nodes[0]
            print "Deleting", hex_key, "at", node.http_addr, "Shouldn't matter what node the delete happens at..global?"
            start = time.time()
            node.remove_key(hex_key, timeoutSecs=30)
            removeElapsed = time.time() - start
            print "Deleting", hex_key, "took", removeElapsed, "seconds"

            # xList.append(ntrees)
            xList.append(parsedBytes)
            eList.append(parseElapsed)
            fList.append(removeElapsed)

        # just plot the last one
        if 1==1:
            xLabel = 'parsedBytes'
            eLabel = 'parseElapsed'
            fLabel = 'removeElapsed'
            eListTitle = ""
            fListTitle = ""
            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
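# h2o_gbm.plotLists is part of the test harness; a hedged sketch of the kind
# of two-series plot it appears to produce, written directly with matplotlib
# (the axis wiring here is an assumption, not the helper's actual code):
def plot_parse_vs_remove(xList, eList, fList):
    import matplotlib.pyplot as plt
    fig, ax1 = plt.subplots()
    ax1.plot(xList, eList, 'b-o', label='parseElapsed')
    ax1.set_xlabel('parsedBytes')
    ax1.set_ylabel('parseElapsed (secs)')
    ax2 = ax1.twinx()  # second y-axis for the delete timings
    ax2.plot(xList, fList, 'r-s', label='removeElapsed')
    ax2.set_ylabel('removeElapsed (secs)')
    plt.show()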
Example #27
    def test_libsvm(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        for trial in range(2):
            csvFilename = "syn_ints.csv"
            hex_key = "1.hex"
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, trial)
            timeoutSecs = 10

            # have to import each time, because h2o deletes source after parse

            # PARSE******************************************
            # creates csvFilename.hex from file in importFolder dir
            # parseResult = h2i.import_parse(path=csvPathname, parser_type='SVMLight', hex_key=hex_key, timeoutSecs=2000)
            parseResult = h2i.import_parse(parser_type=PARSER_TYPE,
                                           path=csvPathname,
                                           hex_key=hex_key,
                                           timeoutSecs=2000)

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(key=hex_key, timeoutSecs=360)
            print "Inspect:", hex_key, "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            summaryResult = h2o_cmd.runSummary(key=hex_key)
            h2o_cmd.infoFromSummary(summaryResult)

            if DO_KMEANS:
                # KMEANS******************************************
                kwargs = {
                    'k': 3,
                    'initialization': 'Furthest',
                    'ignored_cols': None,  # range(11, numCols) BREAKS THE REST API
                    'max_iter': 10,
                    # 'normalize': 0,
                    # reuse the same seed, to get deterministic results (otherwise it sometimes fails)
                    'seed': 265211114317615310,
                }

                # fails if I put this in kwargs..i.e. source = dest
                # 'destination_key': parseResult['destination_key'],

                timeoutSecs = 600
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult,
                                           timeoutSecs=timeoutSecs,
                                           **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # this does an inspect of the model and prints the clusters
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                (centers, tupleResultList) = h2o_kmeans.bigCheckResults(
                    self, kmeans, csvPathname, parseResult, 'd', **kwargs)
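# write_syn_dataset above emits a file that gets parsed with parser_type
# SVMLight. A minimal sketch of an SVMLight-format writer (label followed by
# 1-based index:value pairs); the sparsity and value ranges are assumptions.
def write_svmlight_sketch(csvPathname, rowCount=100, colCount=10, seed=42):
    r = random.Random(seed)
    dsf = open(csvPathname, "w+")
    for i in range(rowCount):
        label = r.randint(0, 1)
        # emit only "nonzero" features, index:value with 1-based indices
        pairs = ["%d:%d" % (j + 1, r.randint(1, 9))
                 for j in range(colCount) if r.random() < 0.3]
        dsf.write(str(label) + " " + " ".join(pairs) + "\n")
    dsf.close()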
    def test_parse_200k_cols_fvec(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 1000, 'cA', 200, 200),
            (10, 2000, 'cA', 200, 200),
            (10, 4000, 'cA', 200, 200),
            (10, 8000, 'cA', 200, 200),
            (10, 9000, 'cA', 200, 200),
            (10, 10000, 'cA', 200, 200),
            # (10, 100000, 'cA', 200, 200),
            # (10, 200000, 'cB', 200, 200),
            # (10, 300000, 'cB', 200, 200),
            # we timeout/fail on 500k? stop at 200k
            # (10, 500000, 'cC', 200, 200),
            # (10, 1000000, 'cD', 200, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            # does it blow up if it sets columnNames?
            parseResult = h2i.import_parse(path=csvPathname, schema='local', hex_key=hex_key,
                timeoutSecs=timeoutSecs, doSummary=False, columnNames=None, intermediateResults=DO_INTERMEDIATE_RESULTS)
            print "Parse:", csvFilename, "took", time.time() - start, "seconds"

        
            print "Skipping the row/cols check for now"
            if 1==0:
                start = time.time()
                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
                print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
                h2o_cmd.infoFromInspect(inspect, csvPathname)
                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(inspect['numRows']), \
                    "    numCols:", "{:,}".format(inspect['numCols'])

                # should match # of cols in header or ??
                self.assertEqual(inspect['numCols'], colCount,
                    "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
                self.assertEqual(inspect['numRows'], rowCount,
                    "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                    (inspect['numRows'], rowCount))

            print "Skipping the delete keys for now"
            if 1==0:
                # if not h2o.browse_disable:
                #    h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
                #    time.sleep(5)
                h2i.delete_keys_at_all_nodes()
    def test_frame_split_balance(self):
        h2o.beta_features = True

        csvFilename = 'covtype.data'
        csvPathname = 'standard/' + csvFilename
        hex_key = "covtype.hex"

        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       hex_key=hex_key,
                                       schema='local',
                                       timeoutSecs=20)

        print "Just split away and see if anything blows up"
        splitMe = hex_key
        inspect = h2o_cmd.runInspect(key=splitMe)
        origNumRows = inspect['numRows']
        origNumCols = inspect['numCols']
        for s in range(20):
            inspect = h2o_cmd.runInspect(key=splitMe)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            fs = h2o.nodes[0].frame_split(source=splitMe, ratios=0.5)
            split0_key = fs['split_keys'][0]
            split1_key = fs['split_keys'][1]
            split0_rows = fs['split_rows'][0]
            split1_rows = fs['split_rows'][1]
            split0_ratio = fs['split_ratios'][0]
            split1_ratio = fs['split_ratios'][1]
            print "Iteration", s, "split0_rows:", split0_rows, "split1_rows:", split1_rows
            splitMe = split1_key
            # split should be within 1 row of even
            self.assertLess(abs(split1_rows - split0_rows), 2)
            self.assertEqual(numRows, (split1_rows + split0_rows))
            self.assertEqual(numCols, origNumCols)
            if split0_rows <= 2:
                break

            print "Now do some rebalancing on the split frames"
            for trial in range(2):
                rb_key = "rb_%s_%s" % (trial, splitMe)
                SEEDPERFILE = random.randint(0, sys.maxint)
                randChunks = random.randint(1, 100)
                start = time.time()
                print "Trial %s: Rebalancing %s to %s with %s chunks" % (
                    trial, splitMe, rb_key, randChunks)
                rebalanceResult = h2o.nodes[0].rebalance(source=hex_key,
                                                         after=rb_key,
                                                         seed=SEEDPERFILE,
                                                         chunks=randChunks)
                elapsed = time.time() - start
                print "rebalance end on ", csvFilename, 'took', elapsed, 'seconds',\
                h2o_cmd.runSummary(key=rb_key)
                print "\nInspecting the original parsed result"
                inspect = h2o_cmd.runInspect(key=hex_key)
                h2o_cmd.infoFromInspect(inspect=inspect)
                print "\nInspecting the rebalanced result with %s forced chunks" % randChunks
                inspect = h2o_cmd.runInspect(key=rb_key)
                h2o_cmd.infoFromInspect(inspect=inspect)
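
        # Distilled from the loop above: the minimal frame_split + rebalance call
        # pattern, using only node-API calls already exercised here (key names
        # are illustrative):
        #   fs = h2o.nodes[0].frame_split(source='covtype.hex', ratios=0.5)
        #   leftKey, rightKey = fs['split_keys']
        #   h2o.nodes[0].rebalance(source=rightKey, after='rb_right', seed=42, chunks=32)
        #   h2o_cmd.runSummary(key='rb_right')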
Exemple #30
    def test_c7_rel(self):
        h2o.beta_features = False
        print "Since the python is not necessarily run as user=0xcust..., can't use a  schema='put' here"
        print "Want to be able to run python as jenkins"
        print "I guess for big 0xcust files, we don't need schema='put'"
        print "For files that we want to put (for testing put), we can get non-private files"

        csvFilename = 'part-00000b'
        importFolderPath = '/mnt/0xcustomer-datasets/c2'
        csvPathname = importFolderPath + "/" + csvFilename

        # FIX! does 'separator=' take ints or ?? hex format
        # looks like it takes the hex string (two chars)
        start = time.time()
        # hardwire TAB as a separator, as opposed to white space (9)
        parseResult = h2i.import_parse(path=csvPathname, schema='local', timeoutSecs=500, separator=9, doSummary=False)
        print "Parse of", parseResult['destination_key'], "took", time.time() - start, "seconds"

        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=500)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        num_rows = inspect['num_rows']
        num_cols = inspect['num_cols']
        print "\n" + csvFilename, "    num_rows:", "{:,}".format(num_rows), "    num_cols:", "{:,}".format(num_cols)

        summaryResult = h2o_cmd.runSummary(key=parseResult['destination_key'], numCols=num_cols, numRows=num_rows, max_column_display=2500)
        # infoFromSummary is already called inside runSummary
        # h2o_cmd.infoFromSummary(summaryResult, noPrint=False, numCols=num_cols, numRows=num_rows)

        keepPattern = "oly_|mt_|b_"
        y = "is_purchase"
        print "y:", y
        # don't need the intermediate Dicts produced from columnInfoFromInspect
        x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseResult['destination_key'], timeoutSecs=300)
        print "x:", x

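        # 'lambda' is a Python keyword, so the GLM parameters travel in a dict
        # and are applied via **kwargs instead of named arguments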
        kwargs = {
            'x': x,
            'y': y,
            # 'case_mode': '>',
            # 'case': 0,
            'family': 'binomial',
            'lambda': 1.0E-5,
            'alpha': 0.5,
            'max_iter': 4,
            'n_folds': 1,
            'beta_epsilon': 1.0E-4,
            }

        timeoutSecs = 3600

        if DO_GLM:
            start = time.time()
            glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "glm completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Exemple #31
    def test_many_cols_and_values_with_syn(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (100, 10000, 'cA', 60, 120),
            (100, 30000, 'cB', 60, 120),
            (100, 50000, 'cC', 60, 120),
            (100, 70000, 'cD', 60, 120),
            (100, 90000, 'cE', 60, 120),
            (100, 100000, 'cF', 60, 120),
        ]

        if not H2O_SUPPORTS_OVER_100K_COLS:
            print "Restricting number of columns tested to 100,000"
        else:
            tryList = tryList + [
                (100, 200000, 'cG', 60, 120),
                (100, 300000, 'cH', 60, 120),
                (100, 400000, 'cI', 60, 120),
                (100, 500000, 'cJ', 60, 120),
                (100, 600000, 'cK', 60, 120),
                (100, 700000, 'cL', 60, 120),
                (100, 800000, 'cM', 60, 120),
                (100, 900000, 'cN', 60, 120),
                (100, 1000000, 'cO', 60, 120),
            ]

        for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            sel = 0
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

            start = time.time()
            print csvFilename, "parse starting"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=True)
            h2o.check_sandbox_for_errors()
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse and summary:", parseResult['destination_key'], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            # should match # of cols in header or ??
            self.assertEqual(inspect['num_cols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount))
            self.assertEqual(inspect['num_rows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['num_rows'], rowCount))
        def predict_and_compare_csvs(model_key):
            start = time.time()
            predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey)
            print "runPredict end on ", hexKey, " took", time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
            h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            (rowNum1, originalOutput) = compare_csv_last_col(csvExecPathname,
                msg="Original, after being exec'ed", skipHeader=True)
            (rowNum2, predictOutput)  = compare_csv_last_col(csvPredictPathname, 
                msg="Predicted", skipHeader=True)

            # no header on source
            if rowNum1 != rowNum2:
                raise Exception("original rowNum1: %s not same as downloaded predict (w/header) rowNum2: %s" %
                    (rowNum1, rowNum2))

            wrong = 0
            wrong0 = 0
            wrong1 = 0
            for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
                o = float(o)
                p = float(p)
                if o!=p:
                    msg = "Comparing original output col vs predicted. row %s differs. \
                        original: %s predicted: %s"  % (rowNum, o, p)
                    if p==0.0 and wrong0==10:
                        print "Not printing any more predicted=0 mismatches"
                    elif p==0.0 and wrong0<10:
                        print msg
                    if p==1.0 and wrong1==10:
                        print "Not printing any more predicted=1 mismatches"
                    elif p==1.0 and wrong1<10:
                        print msg

                    if p==0.0:
                        wrong0 += 1
                    elif p==1.0:
                        wrong1 += 1

                    wrong += 1

            print "wrong0:", wrong0
            print "wrong1:", wrong1
            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong)/len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # h2o's binomial modelling should do much better than this; 10% is a loose bound
            if pctWrong > 10.0:
                raise Exception("pct wrong: %s too high. Expect < 10 pct error" % pctWrong)
Exemple #33
        def predict_and_compare_csvs(model_key):
            start = time.time()
            predict = h2o_cmd.runPredict(model_key=model_key, data_key=hexKey, destination_key=predictHexKey)
            print "runPredict end on ", hexKey, " took", time.time() - start, 'seconds'
            h2o.check_sandbox_for_errors()
            inspect = h2o_cmd.runInspect(key=predictHexKey)
            h2o_cmd.infoFromInspect(inspect, 'predict.hex')

            h2o.nodes[0].csv_download(src_key=predictHexKey, csvPathname=csvPredictPathname)
            h2o.nodes[0].csv_download(src_key=execHexKey, csvPathname=csvExecPathname)
            h2o.check_sandbox_for_errors()

            print "Do a check of the original output col against predicted output"
            translate = {1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0}
            (rowNum1, originalOutput) = compare_csv_last_col(csvExecPathname,
                msg="Original, after being exec'ed", skipHeader=True)
            (rowNum2, predictOutput)  = compare_csv_last_col(csvPredictPathname, 
                msg="Predicted", skipHeader=True)

            # no header on source
            if rowNum1 != rowNum2:
                raise Exception("original rowNum1: %s not same as downloaded predict (w/header) rowNum2: %s" %
                    (rowNum1, rowNum2))

            wrong = 0
            wrong0 = 0
            wrong1 = 0
            for rowNum,(o,p) in enumerate(zip(originalOutput, predictOutput)):
                o = float(o)
                p = float(p)
                if o!=p:
                    msg = "Comparing original output col vs predicted. row %s differs. \
                        original: %s predicted: %s"  % (rowNum, o, p)
                    if p==0.0 and wrong0==10:
                        print "Not printing any more predicted=0 mismatches"
                    elif p==0.0 and wrong0<10:
                        print msg
                    if p==1.0 and wrong1==10:
                        print "Not printing any more predicted=1 mismatches"
                    elif p==1.0 and wrong1<10:
                        print msg

                    if p==0.0:
                        wrong0 += 1
                    elif p==1.0:
                        wrong1 += 1

                    wrong += 1

            print "wrong0:", wrong0
            print "wrong1:", wrong1
            print "\nTotal wrong:", wrong
            print "Total:", len(originalOutput)
            pctWrong = (100.0 * wrong)/len(originalOutput)
            print "wrong/Total * 100 ", pctWrong
            # h2o's binomial modelling should do much better than this; 16% is a loose bound
            if pctWrong > 16.0:
                raise Exception("pct wrong: %s too high. Expect < 16 pct error" % pctWrong)
Exemple #34
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
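    # h2o.beta_features picks between the old KMeans API and the newer fvec/KMeans2
    # API; the two return models with different layouts, hence the fork below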
    if h2o.beta_features:
        model_key = kmeans["model"]["_selfKey"]
        # Exception: rjson error in inspect: Argument 'src_key' error: benign_k.hex:Key is not a Frame

        # can't use inspect on a model key? now?
        kmeansResult = kmeans
        model = kmeansResult["model"]
        centers = model["clusters"]
        error = model["error"]
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        model = kmeansResult["KMeansModel"]
        centers = model["clusters"]
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        pass
        # no scoring on Kmeans2?..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey
        )
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult["summaries"][0]["hcnt"]  # histogram
        rows_per_cluster = hcnt
        # have to figure out how to get this with fvec
        sqr_error_per_cluster = [0 for h in hcnt]

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey
        )
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key)
        score = kmeansScoreResult["score"]
        rows_per_cluster = score["rows_per_cluster"]
        sqr_error_per_cluster = score["sqr_error_per_cluster"]

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", centers[i]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
Exemple #35
    def test_parse_cust(self):
        # run as user 0xcustomer to get access (with .json config and ssh key file specified)
        importFolderPath = '/mnt/0xcustomer-datasets'
        pollTimeoutSecs = 120
        retryDelaySecs = 30
        timeoutSecs = 300
        
        (importResult, importPattern) = h2i.import_only(path=importFolderPath + "/*")
        importFileList = importResult['files']
        importFailList = importResult['fails']
        importKeyList = importResult['keys']
        importDelList = importResult['dels']

        if len(importDelList)!=0:
            raise Exception("import shouldn't have any deletes. importDelList: %s" % h2o.dump_json(importDelList))

        if len(importFileList)<MINFILES:
            raise Exception("Didn't import successfully. importFileList: %s" % h2o.dump_json(importFileList))

        if len(importKeyList)<MINFILES:
            raise Exception("Didn't import successfully. importKeyList: %s" % h2o.dump_json(importKeyList))

        if len(importFailList)!=0:
            raise Exception("Didn't import successfully. importFailList: %s" % h2o.dump_json(importFailList))


        # only parse files with .csv or .tsv in their name (no dirs like that?)
        goodKeyList = [key for key in importKeyList if ('.csv' in key  or '.tsv' in key)]
        trial = 0
        # just do 3 random ones
        for i, importKey in enumerate(random.sample(goodKeyList,3)):
            print "importKey:", importKey
            trial +=1

            start = time.time() 
            # some data has ,, in the header row. can't have multiple NAs. h2o doesn't like
            # force header=0..should mean headers get treated as NAs
            parseResult = h2i.parse_only(pattern=importKey, header=0,
                timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs)
            elapsed = time.time() - start
            print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "Parse result['destination_key']:", parseResult['destination_key']

            origKey = parseResult['destination_key']
            inspect = h2o_cmd.runInspect(key=origKey)
            h2o_cmd.infoFromInspect(inspect, origKey)

            execExpr = 'newKey = '+origKey+'[1,1]'
            h2e.exec_expr(h2o.nodes[0], execExpr, "a", timeoutSecs=30)
            newParseKey = {'destination_key': 'newKey'}

            h2o_cmd.checkKeyDistribution()
            h2o.nodes[0].remove_key(key=origKey)
            # a key isn't created for a scalar
            # h2o.nodes[0].remove_key(key='newKey')
        
        self.assertGreater(trial, MINDONE-1, msg="There should be at least %s parsed files" % MINDONE)
Exemple #36
    def test_parse_bounds_libsvm(self):
        # just do the import folder once

        # make the timeout variable per dataset. it can be 10 secs for covtype 20x (col key creation)
        # so probably 10x that for covtype200
        csvFilenameList = [
            ("mnist_train.svm", "cM", 30, 1),
            # FIX! fails KMeansScore
            # not integer output
            # ("colon-cancer.svm",   "cA", 30, 1),
            ("connect4.svm",       "cB", 30, 1),
            ("syn_6_1000_10.svm",  "cK", 30, 1),
            ("syn_0_100_1000.svm", "cL", 30, 1),
            ("mushrooms.svm",      "cG", 30, 1),
            ("duke.svm",           "cD", 30, 1),
            # too many features? 150K inspect timeout?
            # ("E2006.train.svm",    "cE", 30, 1),
            ("gisette_scale.svm",  "cF", 30, 1),
            ("news20.svm",         "cH", 30, 1),

            ("tmc2007_train.svm",  "cJ", 30, 1),
            ("covtype.binary.svm", "cC", 30, 1),
            # normal csv
        ]

        ### csvFilenameList = random.sample(csvFilenameAll,1)
        # h2b.browseTheCloud()
        lenNodes = len(h2o.nodes)

        firstDone = False
        for (csvFilename, hex_key, timeoutSecs, resultMult) in csvFilenameList:
            # have to import each time, because h2o deletes source after parse
            bucket = "home-0xdiag-datasets"
            csvPathname = "libsvm/" + csvFilename

            # PARSE******************************************
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=2000)
            print csvPathname, 'parse time:', parseResult['response']['time']
            print "Parse result['destination_key']:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvFilename)

            # RF******************************************
            kwargs = {
                'ntree': 6,
                'response_variable': 0,
            }
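            # response_variable=0 presumably points RF at the first column, which
            # matches the label-first line layout of libsvm files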

            timeoutSecs = 600
            start = time.time()
            rf = h2o_cmd.runRFOnly(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            elapsed = time.time() - start
            print "rf end on ", csvPathname, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
Exemple #37
def bigCheckResults(self, kmeans, csvPathname, parseResult, applyDestinationKey, **kwargs):
    simpleCheckKMeans(self, kmeans, **kwargs)
    if h2o.beta_features:
        # can't use inspect on a model key? now?
        model = kmeans["model"]
        model_key = model["_key"]
        centers = model["centers"]
        cluster_variances = model["within_cluster_variances"]
        error = model["total_within_SS"]
        kmeansResult = kmeans
    else:
        model_key = kmeans["destination_key"]
        kmeansResult = h2o_cmd.runInspect(key=model_key)
        h2o.verboseprint("kmeans result:", h2o.dump_json(kmeansResult))
        model = kmeansResult["KMeansModel"]
        centers = model["clusters"]
        error = model["error"]

    if h2o.beta_features:
        # need to use Predict2?
        pass
        # no scoring on Kmeans2?..just reuse
        # cols/max_ncols params?
        predictKey = applyDestinationKey
        predictResult = h2o.nodes[0].generate_predictions(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=predictKey
        )
        summaryResult = h2o.nodes[0].summary_page(key=predictKey)
        hcnt = summaryResult["summaries"][0]["hcnt"]  # histogram
        rows_per_cluster = hcnt
        # FIX! does the cluster order/naming match, compared to cluster variances
        sqr_error_per_cluster = cluster_variances

    else:
        kmeansApplyResult = h2o.nodes[0].kmeans_apply(
            data_key=parseResult["destination_key"], model_key=model_key, destination_key=applyDestinationKey
        )
        inspect = h2o_cmd.runInspect(None, applyDestinationKey)
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # this was failing
        summaryResult = h2o_cmd.runSummary(key=applyDestinationKey)
        h2o_cmd.infoFromSummary(summaryResult, noPrint=False)

        kmeansScoreResult = h2o.nodes[0].kmeans_score(key=parseResult["destination_key"], model_key=model_key)
        score = kmeansScoreResult["score"]
        rows_per_cluster = score["rows_per_cluster"]
        sqr_error_per_cluster = score["sqr_error_per_cluster"]

    tupleResultList = []
    print "\nerror: ", error
    for i, c in enumerate(centers):
        print "\ncenters[" + str(i) + "]: ", [round(c, 2) for c in centers[i]]
        print "rows_per_cluster[" + str(i) + "]: ", rows_per_cluster[i]
        print "sqr_error_per_cluster[" + str(i) + "]: ", sqr_error_per_cluster[i]
        tupleResultList.append((centers[i], rows_per_cluster[i], sqr_error_per_cluster[i]))

    return (centers, tupleResultList)
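
# A hedged usage sketch for bigCheckResults; the KMeans runner name and keys are
# illustrative, not taken from this source:
#   kmeans = h2o_cmd.runKMeans(parseResult=parseResult, k=3, timeoutSecs=120)
#   (centers, tupleResultList) = bigCheckResults(
#       self, kmeans, csvPathname, parseResult, 'kmeans_apply.hex', k=3)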
Exemple #38
    def test_parse_manyfiles_1(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirname = "manyfiles-nflx-gz"
        timeoutSecs = 600
        trial = 0
        for iteration in range(ITERATIONS):
            
            csvFilename = "file_1.dat.gz"
            csvPathname = csvDirname + "/" + csvFilename
            trialStart = time.time()
            # PARSE****************************************
            hex_key =  csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema=SCHEMA, hex_key=hex_key,
                delete_on_done=DELETE_ON_DONE, 
                # importParentDir=IMPORT_PARENT_DIR,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120, doSummary=False)
            elapsed = time.time() - start
            print "parse", trial, "end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(numCols, 542)
            self.assertEqual(numRows, 100000)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            # goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            for node in h2o.nodes:
                h2o_cmd.runStoreView(node=node, timeoutSecs=30, view=10000)

            # convert to binomial
            if DO_EXEC:
                execExpr="A.hex=%s" % parseResult['destination_key']
                h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)

                # execExpr = 'A.hex[,378+1]=(A.hex[,378+1]>15)'
                # h2e.exec_expr(execExpr=execExpr, timeoutSecs=20)

            if DO_DELETE_MYSELF:
                h2o_import.delete_keys_at_all_nodes()

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
    def test_parse_500_cols_fvec(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000, 500, 'cA', 1800, 1800),
        ]

        h2b.browseTheCloud()
        for (rowCount, colCount, orig_hex_key, timeoutSecs,
             timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(
                rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            # create sym links
            multifile = 1000
            # there is already one file. assume it's the "0" case
            for p in range(1, multifile):
                csvPathnameLink = csvPathname + "_" + str(p)
                os.symlink(csvFilename, csvPathnameLink)
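            # note: the symlink target doesn't exist yet; dangling symlinks are legal
            # on POSIX and resolve once write_syn_dataset creates the file below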

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            for trial in range(10):
                hex_key = orig_hex_key + str(trial)
                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname + "*",
                                               schema='local',
                                               hex_key=hex_key,
                                               delete_on_done=1,
                                               timeoutSecs=timeoutSecs,
                                               doSummary=False)
                print "Parse:", parseResult[
                    'destination_key'], "took", time.time() - start, "seconds"

                start = time.time()
                inspect = h2o_cmd.runInspect(None,
                                             parseResult['destination_key'],
                                             timeoutSecs=timeoutSecs2)
                print "Inspect:", parseResult[
                    'destination_key'], "took", time.time() - start, "seconds"
                h2o_cmd.infoFromInspect(inspect, csvPathname)
                print "\n" + csvPathname, \
                    "    numRows:", "{:,}".format(inspect['numRows']), \
                    "    numCols:", "{:,}".format(inspect['numCols'])

                # should match # of cols in header or ??
                self.assertEqual(
                    inspect['numCols'], colCount,
                    "parse created result with the wrong number of cols %s %s"
                    % (inspect['numCols'], colCount))
                self.assertEqual(inspect['numRows'], rowCount * multifile,
                    "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                    (inspect['numRows'], rowCount * multifile))
Exemple #40
    def test_fp_many_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()

        if H2O_SUPPORTS_OVER_50K_COLS:
            tryList = [
                (100, 200000, 'cG', 120, 120),
                (100, 300000, 'cH', 120, 120),
                (100, 400000, 'cI', 120, 120),
                (100, 500000, 'cJ', 120, 120),
                (100, 700000, 'cL', 120, 120),
                (100, 800000, 'cM', 120, 120),
                (100, 900000, 'cN', 120, 120),
                (100, 1000000, 'cO', 120, 120),
                (100, 1200000, 'cK', 120, 120),
            ]
        else:
            print "Restricting number of columns tested to 50,000"
            tryList = [
                (100, 200000, 'cG', 400, 400),
                (100, 300000, 'cH', 400, 400),
                (100, 400000, 'cI', 400, 400),
                (100, 500000, 'cJ', 400, 400),
            ]

        for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)
            sel = 0
            csvFilename = "syn_%s_%s_%s_%s.csv" % (SEEDPERFILE, sel, rowCount, colCount)
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel)

            start = time.time()
            print csvFilename, "parse starting"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False)
            h2o.check_sandbox_for_errors()
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse and summary:", parseResult['destination_key'], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs2)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            # should match # of cols in header or ??
            self.assertEqual(inspect['num_cols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount))
            self.assertEqual(inspect['num_rows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['num_rows'], rowCount))
    def test_parse_summary_manyfiles_s3_fvec(self):
        h2o.beta_features = True
        # these will be used as directory imports/parse
        csvDirlist = [("manyfiles-nflx-gz", 800)]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

            # change to 50 files
            csvPathname = csvDirname + "/file_[2][0-4][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(
                bucket="home-0xdiag-datasets", path=csvPathname, schema="s3", timeoutSecs=timeoutSecs
            )

            print "\nTrying StoreView after the import hdfs"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets",
                path=csvPathname,
                schema="s3",
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                retryDelaySecs=10,
                pollTimeoutSecs=120,
            )
            elapsed = time.time() - start
            print "parse end on ", parseResult["destination_key"], "took", elapsed, "seconds", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs
            )

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"], timeoutSecs=360)
            print "Inspect:", parseResult["destination_key"], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult["destination_key"], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
    def test_parse_header_rows_mismatch(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        # headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON"
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL"

        totalCols = 8
        totalRows = 10000
        rList = rand_rowData(totalCols)
        write_syn_dataset(csvPathname, totalRows, headerData, rList)

        for trial in range(2):
            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            key = csvFilename + "_" + str(trial)
            key2 = csvFilename + "_" + str(trial) + ".hex"

            start = time.time()
            timeoutSecs = 30
            print "Force it to think there's a header. using comma forced as separator"
            parseKey = h2o_cmd.parseFile(csvPathname=csvPathname,
                                         key=key,
                                         key2=key2,
                                         timeoutSecs=timeoutSecs,
                                         pollTimeoutSecs=30,
                                         header=1,
                                         separator=44)
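            # separator=44 is the ASCII code for ',', forcing the comma explicitly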
            print "parseKey['destination_key']: " + parseKey['destination_key']
            print 'parse time:', parseKey['response']['time']

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])
            # should match # of cols in header or ??
            self.assertEqual(
                inspect['num_cols'], totalCols,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['num_cols'], totalCols))
            self.assertEqual(
                inspect['num_rows'], totalRows,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s"
                % (inspect['num_rows'], totalRows))

            kwargs = {'sample': 75, 'depth': 25, 'ntree': 1}
            start = time.time()
            rfv = h2o_cmd.runRFOnly(parseKey=parseKey,
                                    timeoutSecs=30,
                                    **kwargs)
            elapsed = time.time() - start
            print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
            print "trial #", trial, "totalRows:", totalRows, "parse end on ", csvFilename, \
                'took', time.time() - start, 'seconds'

            h2o.check_sandbox_for_errors()
    def test_cols_enum_multi_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ]
        tryList = [
            (300, 100, 'cA', 60, '*x[2-5]*'),
            (310, 200, 'cB', 60, '*x[1,3-5]*'),
            (320, 300, 'cC', 60, '*x[1-2,4-5]*'),
            (330, 400, 'cD', 60, '*x[1-3-5]*'),
            (340, 500, 'cE', 60, '*x[1-4]*'),
        ]

        h2b.browseTheCloud()
        cnum = 0
        # create them all first
        for (rowCount, colCount, key2, timeoutSecs, excludePattern) in tryList:
            cnum += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            for fileN in range(FILENUM):
                csvFilename = 'syn_' + str(fileN) + "_" + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount, SEED,
                                  translateList)

        for (rowCount, colCount, key2, timeoutSecs, excludePattern) in tryList:
            cnum += 1
            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            h2o.nodes[0].import_files(SYNDATASETS_DIR)
            # pattern match all, then use exclude
            parseKey = h2o.nodes[0].parse('*',
                                          key2=key2,
                                          exclude=excludePattern,
                                          header=1,
                                          timeoutSecs=timeoutSecs)
            print "parseKey['destination_key']: " + parseKey['destination_key']
            print 'parse time:', parseKey['response']['time']

            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # FIX! h2o strips one of the headers, but treats all the other files with headers as data
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            print "\n" + parseKey['destination_key'] + ":", \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols)

            # together the files should total rowCount*FILENUM rows (given the excludePattern)
            self.assertEqual(num_rows, rowCount*FILENUM, msg=("got num_rows: %s. Should be rowCount: %s * FILENUM: %s" % \
                (num_rows, rowCount, FILENUM)))
Exemple #44
    def test_storeview_import(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        importFolderPath = "standard"
        csvFilelist = [
            ("covtype.data", 300),
        ]

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            csvPathname = importFolderPath + "/" + csvFilename
            trialStart = time.time()

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            print "parse start on:", csvFilename
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                hex_key=hex_key, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseResult['destination_key']

            # INSPECT******************************************
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # SUMMARY****************************************
            # gives us some reporting on missing values, constant values, 
            # to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=0,
                key=parseResult['destination_key'], timeoutSecs=300)
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult, noPrint=True)

            # STOREVIEW***************************************
            print "Trying StoreView to all nodes after the parse"
            
            for n, node in enumerate(h2o.nodes):
                print "\n*****************"
                print "StoreView node %s:%s" % (node.http_addr, node.port)
                storeViewResult = h2o_cmd.runStoreView(node, timeoutSecs=30)
                # dump each node's StoreView to its own file for later inspection
                with open(SYNDATASETS_DIR + "/storeview_" + str(n) + ".txt", "w") as f:
                    f.write(h2o.dump_json(storeViewResult))
                lastStoreViewResult = storeViewResult
            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Exemple #45
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL-2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz
    doSummary = False
    parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=doSummary)
    print csvFilenameReplgz, 'parse time:', parseKey['response']['time']
    if doSummary:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo , parseKey['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    start = time.time()
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
    print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    print "\n" + csvPathname, \
        "    num_rows:", "{:,}".format(inspect['num_rows']), \
        "    num_cols:", "{:,}".format(inspect['num_cols'])

    # there is an extra response variable
    if inspect['num_cols'] != (colCount + 1):
        raise Exception("parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount))
    if inspect['num_rows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
        (inspect['num_rows'], rowCount))

    # hack it in! for test purposes only
    parseKey['python_source_key'] = csvFilenameReplgz
    parseKey['num_rows'] = inspect['num_rows']
    parseKey['num_cols'] = inspect['num_cols']
    parseKey['value_size_bytes'] = inspect['value_size_bytes']
    return parseKey
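
# A hedged usage sketch for make_datasetgz_and_parse (all values illustrative):
#   parseKey = make_datasetgz_and_parse(h2o.make_syn_dir(), 'syn_1mx10.csv',
#       'cRepl.hex', rowCount=1000000, colCount=10, FILEREPL=5,
#       SEEDPERFILE=42, timeoutSecs=300)
#   print parseKey['num_rows'], parseKey['num_cols'], parseKey['value_size_bytes']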
Exemple #46
    def test_short(self):
            csvFilename = 'part-00000b'
            ### csvFilename = 'short'
            importFolderPath = '/home/hduser/data'
            importFolderResult = h2i.setupImportFolder(None, importFolderPath)
            csvPathname = importFolderPath + "/" + csvFilename

            # FIX! does 'separator=' take ints or ?? hex format
            # looks like it takes the hex string (two chars)
            start = time.time()
            # hardwire TAB as a separator, as opposed to white space (9)
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath, 
                timeoutSecs=500, separator=9)
            print "Parse of", parseKey['destination_key'], "took", time.time() - start, "seconds"

            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse result['destination_key']:", parseKey['destination_key']

            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=500)
            print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            # num_rows = inspect['num_rows']
            # num_cols = inspect['num_cols']

            keepPattern = "oly_|mt_|b_"
            y = "is_purchase"
            print "y:", y
            # don't need the intermediate Dicts produced from columnInfoFromInspect
            x = h2o_glm.goodXFromColumnInfo(y, keepPattern=keepPattern, key=parseKey['destination_key'], timeoutSecs=300)
            print "x:", x

            kwargs = {
                'x': x, 
                'y': y,
                # 'case_mode': '>',
                # 'case': 0,
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.5,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 100,
                'beta_epsilon': 1.0E-4,
                }

            timeoutSecs = 1800
            start = time.time()
            glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "glm completed in", elapsed, "seconds.", \
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
Exemple #47
    def test_exec2_row_range(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [(1000000, 5, "cA", 200)]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = "syn_" + str(SEEDPERFILE) + "_" + str(rowCount) + "x" + str(colCount) + ".csv"
            csvPathname = SYNDATASETS_DIR + "/" + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseResult = h2i.import_parse(
                path=csvPathname, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False
            )
            print "Parse:", parseResult["destination_key"], "took", time.time() - start, "seconds"

            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), "    numCols:", "{:,}".format(
                inspect["numCols"]
            )

            # should match # of cols in header or ??
            self.assertEqual(
                inspect["numCols"],
                colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect["numCols"], colCount),
            )
            self.assertEqual(
                inspect["numRows"],
                rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s"
                % (inspect["numRows"], rowCount),
            )

            REPEAT = 1
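            # exec2 slicing below is R-like and 1-based: frame[1,] takes the first
            # row, frame[1:100,] takes rows 1 through 100 inclusive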
            for i in range(REPEAT):
                hex_key_i = hex_key + "_" + str(i)
                execExpr = "%s=%s[1,]" % (hex_key_i, hex_key)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, 100)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, rowCount - 10)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                inspect = h2o_cmd.runInspect(None, hex_key_i, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromInspect(inspect, hex_key_i)
                print "\n" + hex_key_i, "    numRows:", "{:,}".format(
                    inspect["numRows"]
                ), "    numCols:", "{:,}".format(inspect["numCols"])
    def test_parse_summary_airline_s3(self):
        csvFilelist = [
            ("allyears2k.csv",   300), #4.4MB
            ("year1987.csv",     600), #130MB
            ("allyears.csv",     900), #12GB
            # ("allyears_10.csv", 1800), #119.98GB
        ]

        bucket = 'h2o-airlines-unpacked'
        (importHDFSResult, importPattern) = h2i.import_only(bucket=bucket, path='*', schema='s3')
        s3nFullList = importHDFSResult['succeeded']
        self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n")

        print "\nTrying StoreView after the import s3"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            # note: schema='s3' here, not schema='local'
            parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values, constant values, to see if we have x specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid output column (which can be index or name)
            # assume all the configs have the same y..just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y='IsArrDelayed', key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
Exemple #49
    def test_rf_hhp_2a_fvec(self):
        h2o.beta_features = True
        csvFilenameList = [
            'hhp.cut3.214.data.gz',
            ]

        for csvFilename in csvFilenameList:
            csvPathname = csvFilename
            print "RF start on ", csvPathname
            dataKeyTrain = 'rTrain.hex'
            start = time.time()
            parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, hex_key=dataKeyTrain, schema='put',
                timeoutSecs=120)            
            inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            numCols = inspect['numCols']

            # we want the last col. Should be values 0 to 14. 14 most rare

            # from the cut3 set
            #   84777 0
            #   13392 1
            #    6546 2
            #    5716 3
            #    4210 4
            #    3168 5
            #    2009 6
            #    1744 7
            #    1287 8
            #    1150 9
            #    1133 10
            #     780 11
            #     806 12
            #     700 13
            #     345 14
            #    3488 15

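            # rewrite the response column in place: 1 where the class is 14 (the
            # rarest), 0 otherwise, turning the multiclass label into a binomial one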
            execExpr = "%s[,%s] = %s[,%s]==14" % (dataKeyTrain, numCols, dataKeyTrain, numCols)
            h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTrain, timeoutSecs=10)
            inspect = h2o_cmd.runInspect(key=dataKeyTrain)
            h2o_cmd.infoFromInspect(inspect, "going into RF")
            execResult = {'destination_key': dataKeyTrain}


            kwargs = {
                'ntrees': 20,
                'max_depth': 20,
                'nbins': 50,
            }
            rfView = h2o_cmd.runRF(parseResult=execResult, timeoutSecs=900, retryDelaySecs=10, **kwargs)
            print "RF end on ", csvPathname, 'took', time.time() - start, 'seconds'
            (error, classErrorPctList, totalScores) = h2o_rf.simpleCheckRFView(rfv=rfView)
    def test_parse_summary_zip_s3_fvec(self):
        h2o.beta_features = True
        csvFilelist = [
            ("test_set.zip", 300),  # 110.9MB
            ("train_set.zip", 600),  # 362.9MB
        ]

        (importResult, importPattern) = h2i.import_only(bucket='h2o-datasets',
                                                        path="allstate",
                                                        schema='s3')

        print "\nTrying StoreView after the import hdfs"
        h2o_cmd.runStoreView(timeoutSecs=120)

        trial = 0
        for (csvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()
            csvPathname = csvFilename

            # PARSE****************************************
            csvPathname = "allstate/" + csvFilename
            hex_key = csvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='h2o-datasets',
                                           path=csvPathname,
                                           schema='s3',
                                           hex_key=hex_key,
                                           timeoutSecs=timeoutSecs,
                                           retryDelaySecs=10,
                                           pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseResult['destination_key'],
                                         timeoutSecs=360)
            print "Inspect:", parseResult[
                'destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
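# The "pct. of timeout" arithmetic above recurs throughout these tests. A small
# hypothetical helper (not in the original suite) that captures it:
def pct_of_timeout(elapsed, timeoutSecs):
    # integer percentage of the allotted timeout that was actually consumed
    return int((elapsed * 100) / timeoutSecs)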
    def test_loop_random_param_covtype(self):
        csvPathname = h2o.find_file('smalldata/poisson/Goalies.csv')
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname)
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])

        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)

        # for determinism, print the seed (below) so a failing trial can be reproduced
        SEED = random.randint(0, sys.maxint)
        # to force a rerun of a particular case, hardcode the seed here:
        # SEED =
        random.seed(SEED)
        paramDict = define_params()
        print "\nUsing random seed:", SEED
        for trial in range(5):
            # params is mutable; these are the defaults
            # FIX! does it run forever if alpha isn't specified?
            params = {
                'y': 5,
                'n_folds': 1,
                'family': "poisson",
                'alpha': 0.0,
                'lambda': 0,
                'beta_epsilon': 0.001,
                'max_iter': 3,
                'standardize': 1,
                'expert': 1,
                'lsm_solver': 'GenGradient',
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds'] * 30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1)))

            start = time.time()
            print "May not solve: expanded categorical columns cause a large # of cols relative to a small # of rows"
            glm = h2o_cmd.runGLMOnly(timeoutSecs=timeoutSecs,
                                     parseKey=parseKey,
                                     **kwargs)
            elapsed = time.time() - start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            start = time.time()
            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "simpleCheckGLM end on ", csvPathname, 'took', time.time() - start, 'seconds'
            print "Trial #", trial, "completed\n"
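# define_params() and h2o_glm.pickRandGlmParams() are defined outside this
# fragment. Judging from how they are called, paramDict maps a GLM parameter
# name to a list of candidate values, and pickRandGlmParams overwrites params
# with random choices. A hypothetical minimal version (names and value lists
# assumed, not the original suite's):
import random

def define_params():
    return {
        'max_iter': [3, 5, 8],
        'lambda': [0, 1e-4, 1e-2],
        'beta_epsilon': [0.001, 0.0001],
    }

def pick_rand_glm_params(paramDict, params):
    # mutate params in place with one random candidate per key
    for name, choices in paramDict.items():
        params[name] = random.choice(choices)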
    def test_exec2_constants(self):
        print "Create some vectors from a constant"
        print "Don't really need a dataset, but .."
        for i in range(10):
            h2e.exec_zero_list(zeroList)
            inspect = h2o_cmd.runInspect(key='Result9')
            h2o_cmd.infoFromInspect(inspect, 'Result9')
            numRows = inspect['numRows']
            numCols = inspect['numCols']
            self.assertEqual(numRows, 1000000)
            self.assertEqual(numCols, 1)
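# zeroList and h2e.exec_zero_list are defined elsewhere in the original module.
# As an assumption, zeroList holds exec expressions that build keys
# Result0..Result9 (Result9 must come out as the 1,000,000-row, 1-column frame
# asserted above), and exec_zero_list simply runs them in order -- a sketch:
def exec_zero_list(exprList):
    # hypothetical: execute each expression, one exec call per entry
    for expr in exprList:
        h2e.exec_expr(execExpr=expr, timeoutSecs=30)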
    def test_cols_enum_multi_import(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        translateList = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u']
        tryList = [
            (300, 100, 'cA', 60, '*x[2-5]*'),
            (310, 200, 'cB', 60, '*x[1,3-5]*'),
            (320, 300, 'cC', 60, '*x[1-2,4-5]*'),
            (330, 400, 'cD', 60, '*x[1-3,5]*'),
            (340, 500, 'cE', 60, '*x[1-4]*'),
            ]

        ## h2b.browseTheCloud()
        cnum = 0
        # create them all first
        for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList:
            cnum += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", FILENUM, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            for fileN in range(FILENUM):
                csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                write_syn_dataset(csvPathname, rowCount, colCount, SEED, translateList)

        for (rowCount, colCount, hex_key, timeoutSecs, excludePattern) in tryList:
            cnum += 1
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                print f
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f)

            # pattern match all, then use exclude
            parseResult = h2i.parse_only(pattern="*/syn_*",
                hex_key=hex_key, exclude=excludePattern, header=1, timeoutSecs=timeoutSecs)
            print "parseResult['destination_key']: " + parseResult['destination_key']
            print 'parse time:', parseResult['response']['time']

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # FIX! h2o strips one of the headers, but treats all the other files with headers as data
            num_rows = inspect['num_rows']
            num_cols = inspect['num_cols']
            print "\n" + parseResult['destination_key'] + ":", \
                "    num_rows:", "{:,}".format(num_rows), \
                "    num_cols:", "{:,}".format(num_cols)

            # each of the FILENUM surviving files contributes rowCount rows (the excludePattern removes the rest)
            self.assertEqual(num_rows, rowCount*FILENUM, msg=("got num_rows: %s. Should be rowCount: %s * FILENUM: %s" % \
                (num_rows, rowCount, FILENUM)))
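# The exclude patterns in tryList are shell-style globs matched against the
# imported file names. The H2O server does the real matching; this fnmatch demo
# (hypothetical, with made-up names) only illustrates which names a pattern catches:
import fnmatch

names = ['syn_0_42_300x100.csv', 'syn_0_42_310x200.csv', 'syn_0_42_340x500.csv']
print [n for n in names if fnmatch.fnmatch(n, '*x[2-5]*')]
# -> ['syn_0_42_310x200.csv', 'syn_0_42_340x500.csv']; the x100 file survives
#    the exclude, which is why each trial still sees rowCount*FILENUM rows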
Example #54
    def test_parse_1m_cols(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            # (10, 700000, 'cA', 30, 60),
            # (10, 800000, 'cB', 30, 70),
            # (10, 900000, 'cC', 30, 80),
            (10, 1000000, 'cD', 60, 360),
            # (10, 1100000, 'cE', 60, 100),
            # (10, 1200000, 'cF', 60, 120),
        ]

        ### h2b.browseTheCloud()
        for (rowCount, colCount, key2, timeoutSecs, timeoutSecs2) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseKey = h2o_cmd.parseFile(None,
                                         csvPathname,
                                         key2=key2,
                                         timeoutSecs=timeoutSecs,
                                         doSummary=False)
            print csvFilename, 'parse time:', parseKey['response']['time']
            print "Parse:", parseKey['destination_key'], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None,
                                         parseKey['destination_key'],
                                         timeoutSecs=timeoutSecs2)
            print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            # should match # of cols in header or ??
            self.assertEqual(
                inspect['num_cols'], colCount,
                "parse created result with the wrong number of cols %s %s" %
                (inspect['num_cols'], colCount))
            self.assertEqual(inspect['num_rows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['num_rows'], rowCount))
    def test_exec2_row_range(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (1000000, 5, 'cA', 200),
            ]

        # h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "\nCreating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=False)
            print "Parse:", parseResult['destination_key'], "took", time.time() - start, "seconds"

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            # should match # of cols in header or ??
            self.assertEqual(inspect['numCols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount))
            self.assertEqual(inspect['numRows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['numRows'], rowCount))

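            # NB: exec2 slicing mimics R -- row ranges are 1-based and inclusive at
            # both ends, so src[1:100,] keeps the first 100 rows; the plain-Python
            # equivalent on a 0-based, end-exclusive list would be rows[0:100]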
            REPEAT = 1
            for i in range(REPEAT):
                hex_key_i = hex_key + "_" + str(i)
                execExpr = "%s=%s[1,]" % (hex_key_i, hex_key)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, 100)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

                execExpr = "%s=%s[1:%s,]" % (hex_key_i, hex_key, rowCount-10)
                resultExec, result = h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)
                inspect = h2o_cmd.runInspect(None, hex_key_i, timeoutSecs=timeoutSecs)
                h2o_cmd.infoFromInspect(inspect, hex_key_i)
                print "\n" + hex_key_i, \
                    "    numRows:", "{:,}".format(inspect['numRows']), \
                    "    numCols:", "{:,}".format(inspect['numCols'])
Example #56
    def test_parse_65k_cols_01(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        tryList = [
            (10, 63000, 'cH', 100),
            (10, 65000, 'cH', 100),
            ]

        h2b.browseTheCloud()
        for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
            SEEDPERFILE = random.randint(0, sys.maxint)

            csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename

            print "Creating random", csvPathname
            write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

            start = time.time()
            print "Summary should work with 65k"
            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, 
                timeoutSecs=timeoutSecs, doSummary=True)
            print csvFilename, 'parse time:', parseResult['response']['time']
            print "Parse and summary:", parseResult['destination_key'], "took", time.time() - start, "seconds"

            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            # should match # of cols in header or ??
            self.assertEqual(inspect['num_cols'], colCount,
                "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount))
            self.assertEqual(inspect['num_rows'], rowCount,
                "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
                (inspect['num_rows'], rowCount))

            # we should obey max_column_display
            column_limits = [25, 25000]
            for column_limit in column_limits:
                inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], max_column_display=column_limit, timeoutSecs=timeoutSecs)
                self.assertEqual(len(inspect['cols']), column_limit, "inspect obeys max_column_display = " + str(column_limit))
                for r in range(0, len(inspect['rows'])):
                    # NB: +1 below because each row starts with a row-header cell: #{row}
                    self.assertEqual(len(inspect['rows'][r]), column_limit + 1, "inspect data rows obey max_column_display = " + str(column_limit))
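# Each displayed row carries one leading row-header cell (#{row}) before the
# data cells, hence the +1 in the assertion above. A hypothetical helper that
# states the shape invariant being checked:
def check_inspect_shape(inspect, column_limit):
    assert len(inspect['cols']) == column_limit, "cols must obey max_column_display"
    for row in inspect['rows']:
        # one row-header cell plus column_limit data cells
        assert len(row) == column_limit + 1, "rows must obey max_column_display"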
    def test_parse_summary_manyfiles_s3n(self):
        # these will be used as directory imports/parse
        csvDirlist = [
            ("manyfiles-nflx-gz",   600),
        ]
        trial = 0
        for (csvDirname, timeoutSecs) in csvDirlist:

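            # the glob file_[2][0-9][0-9].dat.gz selects exactly file_200 .. file_299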
            csvPathname = csvDirname + "/file_[2][0-9][0-9].dat.gz"
            (importHDFSResult, importPattern) = h2i.import_only(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', timeoutSecs=timeoutSecs)
            s3nFullList = importHDFSResult['succeeded']
            self.assertGreater(len(s3nFullList), 1, "Should see more than 1 file in s3n")

            print "\nTrying StoreView after the s3n import"
            h2o_cmd.runStoreView(timeoutSecs=120)

            trialStart = time.time()
            # PARSE****************************************
            hex_key = csvDirname + "_" + str(trial) + ".hex"
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='s3n', hex_key=hex_key,
                timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=120)
            elapsed = time.time() - start
            print "parse end on ", parseResult['destination_key'], 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            # INSPECT******************************************
            # We should be able to see the parse result?
            start = time.time()
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=360)
            print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
            h2o_cmd.infoFromInspect(inspect, csvPathname)

            # gives us some reporting on missing values and constant values, to see if x is specified well
            # figures out everything from parseResult['destination_key']
            # needs y to avoid the output column (which can be an index or a name)
            # assume all the configs have the same y; just check with the first one
            goodX = h2o_glm.goodXFromColumnInfo(y=54, key=parseResult['destination_key'], timeoutSecs=300)

            # SUMMARY****************************************
            summaryResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=360)
            h2o_cmd.infoFromSummary(summaryResult)

            # STOREVIEW***************************************
            print "\nTrying StoreView after the parse"
            h2o_cmd.runStoreView(timeoutSecs=120)

            print "Trial #", trial, "completed in", time.time() - trialStart, "seconds."
            trial += 1
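# h2o_glm.goodXFromColumnInfo inspects the parsed frame and reports the
# predictor columns that look usable, excluding the response (y=54 here) and
# columns flagged as constant or mostly missing. A hedged usage sketch, assuming
# its return value is accepted as runGLM's 'x' parameter:
kwargs = {'y': 54, 'x': goodX, 'family': 'binomial', 'n_folds': 1}
glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)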
    def test_GLM2_binomial_goalies(self):
        h2o.beta_features = True
        csvPathname = 'poisson/Goalies.csv'
        print "\nParsing", csvPathname
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname,
                                       schema='put',
                                       hex_key="A.hex")
        inspect = h2o_cmd.runInspect(None, "A.hex")
        # need more info about the dataset for debug
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        case = 20
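        # binarize column 7 (1-based): 1 where the value exceeds case, else 0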
        execExpr = "A.hex[,%s]=(A.hex[,%s]>%s)" % (6 + 1, 6 + 1, case)
        h2e.exec_expr(execExpr=execExpr, timeoutSecs=30)

        paramDict = define_params()
        for trial in range(5):
            # params is mutable; these are the defaults
            # FIX! does it run forever if alpha isn't specified?
            params = {
                'response': 6,
                'n_folds': 1,
                'family': "binomial",
                'alpha': 0,
                # seems we always need a little regularization
                'lambda': 1e-4,
                'beta_epsilon': 0.001,
                'max_iter': 8
            }

            colX = h2o_glm.pickRandGlmParams(paramDict, params)
            kwargs = params.copy()

            # make timeout bigger with xvals
            timeoutSecs = 180 + (kwargs['n_folds'] * 30)
            # or double the 4 seconds per iteration (max_iter+1 worst case?)
            timeoutSecs = max(timeoutSecs, (8 * (kwargs['max_iter'] + 1)))

            start = time.time()
            print "May not solve: expanded categorical columns cause a large # of cols relative to a small # of rows"
            glm = h2o_cmd.runGLM(timeoutSecs=timeoutSecs,
                                 parseResult={'destination_key': 'A.hex'},
                                 **kwargs)
            elapsed = time.time() - start
            print "glm end on ", csvPathname, "Trial #", trial, "completed in", elapsed, "seconds.",\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

            h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)
            print "Trial #", trial, "completed\n"
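# The timeout heuristic above appears in both GLM loop tests. Factored into a
# hypothetical helper (not part of the original suite), it reads:
def glm_timeout_secs(n_folds, max_iter):
    # base 180s plus 30s per cross-validation fold, but never less than
    # 8s per iteration; max_iter+1 allows for one extra worst-case pass
    return max(180 + n_folds * 30, 8 * (max_iter + 1))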