def test_KMeansGrid_params_rand2_fvec(self):
        if h2o.localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ("covtype.data", 800)
            ]
        else:
            csvFilenameList = [("covtype.data", 800)]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(
                bucket="home-0xdiag-datasets", path=csvPathname, timeoutSecs=2000, pollTimeoutSecs=60
            )
            inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
            print "\n" + csvPathname, "    numRows:", "{:,}".format(inspect["numRows"]), "    numCols:", "{:,}".format(
                inspect["numCols"]
            )

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                destinationKey = csvFilename + "_" + str(trial) + ".hex"
                params = {"k": "2,3", "destination_key": destinationKey}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(
                    parseResult=parseResult,
                    timeoutSecs=timeoutSecs,
                    retryDelaySecs=2,
                    pollTimeoutSecs=60,
                    noPoll=True,
                    **kwargs
                )
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)

                elapsed = time.time() - start
                print "FIX! how do we get results..need redirect_url"
                print "Have to inspect different models? (grid)"
                print "kmeans end on ", csvPathname, "took", elapsed, "seconds.", "%d pct. of timeout" % (
                    (elapsed / timeoutSecs) * 100
                )
                # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                # destination_key is ignored by kmeans...what are the keys for the results
                # inspect = h2o_cmd.runInspect(None,key=destinationKey)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_KMeansGrid_params_rand2_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
            ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                destinationKey = csvFilename + "_" + str(trial) + '.hex'
                params = {'k': '2,3', 'destination_key': destinationKey}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, noPoll=True, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs,
                                 pollTimeoutSecs=timeoutSecs)

                elapsed = time.time() - start
                print "FIX! how do we get results..need redirect_url"
                print "Have to inspect different models? (grid)"
                print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                # destination_key is ignored by kmeans...what are the keys for the results
                # inspect = h2o_cmd.runInspect(None,key=destinationKey)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
Example #3
0
    def test_KMeans_params_rand2(self):
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype20x.data', 800),
            ]
        else:
            csvFilenameList = [
                ('covtype20x.data', 800),
            ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            parseKey = h2i.parseImportFolderFile(None,
                                                 csvFilename,
                                                 importFolderPath,
                                                 timeoutSecs=2000,
                                                 pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {
                    'k': 1,
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeansOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,
                                             key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_KMeansGrid_params_rand2(self):
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params(SEED)
            h2o.beta_features = True # no grid for VA
            for trial in range(3):
                # default
                destinationKey = csvFilename + "_" + str(trial) + '.hex'
                params = {'k': 'c(2,3)', 'destination_key': destinationKey}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()
        
                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, noPoll=True, **kwargs)
                h2j.pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs)

                elapsed = time.time() - start
                print "FIX! how do we get results..need redirect_url"
                print "Have to inspect different models? (grid)"
                print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                # h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                # destination_key is ignored by kmeans...what are the keys for the results
                # inspect = h2o_cmd.runInspect(None,key=destinationKey)
                # print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
    def test_KMeans_params_rand2_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
            ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
            ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           timeoutSecs=2000,
                                           pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {
                    'max_iter': 20,
                    'k': 1,
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }
                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)

                print "Trial #", trial, "completed\n"
    def test_KMeans_params_rand2_fvec(self):
        h2o.beta_features = True
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        importFolderPath = "standard"
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            csvPathname = importFolderPath + "/" + csvFilename
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {
                    'max_iter': 20, 
                    'k': 1, 
                    'destination_key': csvFilename + "_" + str(trial) + '.hex'
                }
                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)

                print "Trial #", trial, "completed\n"
    def test_KMeans_params_rand2(self):
        if localhost:
            csvFilenameList = [
                # ('covtype.data', 60),
                ('covtype.data', 800),
                ]
        else:
            csvFilenameList = [
                ('covtype.data', 800),
                ]

        importFolderPath = '/home/0xdiag/datasets/standard'
        h2i.setupImportFolder(None, importFolderPath)
        for csvFilename, timeoutSecs in csvFilenameList:
            # creates csvFilename.hex from file in importFolder dir 
            parseKey = h2i.parseImportFolderFile(None, csvFilename, importFolderPath,
                timeoutSecs=2000, pollTimeoutSecs=60)
            inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
            csvPathname = importFolderPath + "/" + csvFilename
            print "\n" + csvPathname, \
                "    num_rows:", "{:,}".format(inspect['num_rows']), \
                "    num_cols:", "{:,}".format(inspect['num_cols'])

            paramDict = define_params(SEED)
            for trial in range(3):
                # default
                params = {'k': 1 }
                # 'destination_key': csvFilename + "_" + str(trial) + '.hex'}

                h2o_kmeans.pickRandKMeansParams(paramDict, params)
                kwargs = params.copy()

                start = time.time()
                kmeans = h2o_cmd.runKMeansGridOnly(parseKey=parseKey, \
                    timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
                elapsed = time.time() - start
                print "kmeans grid end on ", csvPathname, 'took', elapsed, 'seconds.', \
                    "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
                h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

                ### print h2o.dump_json(kmeans)
                inspect = h2o_cmd.runInspect(None,key=kmeans['destination_key'])
                print h2o.dump_json(inspect)

                print "Trial #", trial, "completed\n"
Example #8
0
    def test_KMeans_create_frame_fvec(self):
        for trial in range(20):

            cfParamDict = define_create_frame_params(SEED)
            # default
            params = {
                'rows': 5,
                'cols': 10
            }
            h2o_util.pickRandParams(cfParamDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)

            # h2o does some strict checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None

            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0


            kwargs = params.copy()
            timeoutSecs = 300
            hex_key = 'temp_%s.hex' % trial
            cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
            inspect = h2o_cmd.runInspect(None, hex_key)
            print "\n%s" % hex_key, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            kmeansParamDict = define_KMeans_params(SEED)

            # default
            params = {
                'max_iter': 20, 
                'k': 1, 
                'destination_key': "KM_" + str(trial) + '.hex'
            }
            h2o_kmeans.pickRandKMeansParams(kmeansParamDict, params)
            kwargs = params.copy()

            start = time.time()
            parseResult = {'destination_key': hex_key }
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans trial %s end on ", trial, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)

            print "Trial #", trial, "completed\n"