Ejemplo n.º 1
0
    def test_create_frame_rand1(self):
        h2o.beta_features = True
        # default
        params = {'rows': 1, 'cols': 1}
        for trial in range(20):
            h2o_util.pickRandParams(paramDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)
            # h2o does some strict checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None

            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0

            kwargs = params.copy()

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata',
                                           path='poker/poker1000',
                                           hex_key='temp1000.hex',
                                           schema='put',
                                           timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex',
                                                 timeoutSecs=timeoutSecs,
                                                 **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex',
                                          csvPathname=csvPathname,
                                          timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            h2o_cmd.runSummary(key='temp1000.hex', timeoutSecs=300)
            print h2o.dump_json(cfResult)

            print "Trial #", trial, "completed"
Ejemplo n.º 2
0
    def test_create_frame_rand1(self):
        h2o.beta_features = True
        # default
        params = {
            'rows': 1, 
            'cols': 1
        }
        for trial in range(20):
            h2o_util.pickRandParams(paramDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)
            # h2o does some strick checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:   
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None
                
            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0


            kwargs = params.copy()

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', 
                schema='put', timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            h2o_cmd.runSummary(key='temp1000.hex')
            print h2o.dump_json(cfResult)
    
            print "Trial #", trial, "completed"
Ejemplo n.º 3
0
 def test_speedrf_params_rand2_fvec(self):
     h2o.beta_features = True
     csvPathname = 'standard/covtype.data'
     hex_key = 'covtype.data.hex'
     for trial in range(10):
         # params is mutable. This is default.
         # response is required for SpeeERF
         params = {
             'response': 'C55', 
             'ntrees': 1, 'mtries': 7, 
             'balance_classes': 0, 
             # never run with unconstrained balance_classes size if random sets balance_classes..too slow
             'max_after_balance_size': 2,
             'importance': 0}
         colX = h2o_util.pickRandParams(paramDict, params)
         if 'cols' in params and params['cols']:
             # exclusion
             if 'ignored_cols_by_name' in params:
                 params['ignored_cols_by_name'] = None
         else:
             if 'ignored_cols_by_name' in params and params['ignored_cols_by_name']:
                 params['mtries'] = random.randint(1,53)
             else:
                 params['mtries'] = random.randint(1,54)
             
         kwargs = params.copy()
         # adjust timeoutSecs with the number of trees
         timeoutSecs = 80 + ((kwargs['ntrees']*80) * max(1,kwargs['mtries']/60) )
         start = time.time()
         parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key)
         h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs)
         elapsed = time.time()-start
         print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
Ejemplo n.º 4
0
    def test_create_frame_rand1(self):
        h2o.beta_features = True
        # default
        params = {
            'rows': 1, 
            'cols': 1
        }
        for trial in range(10):
            h2o_util.pickRandParams(paramDict, params)
            i = params.get('integer_fraction', 0)
            c = params.get('categorical_fraction', 0)
            r = params.get('randomize', 0)
            v = params.get('value', None)
            if r:
                if v is not None:
                    # if these are None, they are treated as >0 (default > 0?)
                    params['integer_fraction'] = 0
                    params['categorical_fraction'] = 0
                elif (i and c) and (i + c) >= 1.0:
                    params['integer_fraction'] = i
                    params['categorical_fraction'] = 1.0 - i
            else:
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0
                params['value'] = None


            kwargs = params.copy()

            print kwargs
            timeoutSecs = 300
            parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', 
                schema='put', timeoutSecs=timeoutSecs)
            cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs)

            if DO_DOWNLOAD:
                csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv'
                h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60)

            if DO_INSPECT:
                h2o_cmd.runInspect(key='temp1000.hex')

            h2o_cmd.runSummary(key='temp1000.hex')
            print h2o.dump_json(cfResult)
    
            print "Trial #", trial, "completed"
Ejemplo n.º 5
0
    def test_bayes_and2(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='put')
        paramDict = define_params()
        for trial in range(1):
            response = 'C55'
            params = {
                'response': response,
            }

            colX = h2o_util.pickRandParams(paramDict, params)
            kwargs = params.copy()

            timeoutSecs = 120
            # chagne response to factor
            execExpr = 'covtype.hex[,54+1] = factor(covtype.hex[,54+1] != 5)'  # turn 7-class problem into binomial such that AUC can work below..
            resultExec, ncols = h2e.exec_expr(execExpr=execExpr)

            start = time.time()
            bayesResult = h2o.nodes[0].naive_bayes(timeoutSecs=timeoutSecs,
                                                   source='covtype.hex',
                                                   **kwargs)
            print "bayes end on ", csvPathname, 'took', time.time(
            ) - start, 'seconds'

            print "bayes result:", h2o.dump_json(bayesResult)

            nb_model = bayesResult['nb_model']
            ncats = nb_model['ncats']
            nnums = nb_model['nnums']
            pcond = nb_model['pcond']
            pprior = nb_model['pprior']
            rescnt = nb_model['rescnt']
            modelClassDist = nb_model['_modelClassDist']
            names = nb_model['_names']
            domains = nb_model['_domains']
            priorClassDist = nb_model['_priorClassDist']
            model_key = nb_model['_key']

            # is it an error to get std dev of 0 after predicting?
            print "Doing predict with same dataset, and the bayes model"
            h2o.nodes[0].generate_predictions(model_key=model_key,
                                              data_key='covtype.hex',
                                              prediction='Predict.hex')

            # just get a predict and AUC on the same data. has to be binomial result
            resultAUC = h2o.nodes[0].generate_auc(thresholds=None,
                                                  actual='covtype.hex',
                                                  predict='Predict.hex',
                                                  vactual=response,
                                                  vpredict=1)
            print "AUC result:", h2o.dump_json(resultAUC)

            print "Trial #", trial, "completed\n"
Ejemplo n.º 6
0
    def test_bayes_and2(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put')
        paramDict = define_params()
        for trial in range(1):
            response = 'C55'
            params = {
                'response': response, 
                }

            colX = h2o_util.pickRandParams(paramDict, params)
            kwargs = params.copy()

            timeoutSecs = 120
            # chagne response to factor
            execExpr = 'covtype.hex[,54+1] = factor(covtype.hex[,54+1] != 5)' # turn 7-class problem into binomial such that AUC can work below..
            resultExec, ncols = h2e.exec_expr(execExpr=execExpr)

            start = time.time()
            bayesResult = h2o.nodes[0].naive_bayes(timeoutSecs=timeoutSecs, source='covtype.hex', **kwargs)
            print "bayes end on ", csvPathname, 'took', time.time() - start, 'seconds'

            print "bayes result:", h2o.dump_json(bayesResult)

            nb_model = bayesResult['nb_model']
            ncats = nb_model['ncats']
            nnums = nb_model['nnums']
            pcond = nb_model['pcond']
            pprior = nb_model['pprior']
            rescnt = nb_model['rescnt']
            modelClassDist = nb_model['_modelClassDist']
            names = nb_model['_names']
            domains = nb_model['_domains']
            priorClassDist = nb_model['_priorClassDist']
            model_key = nb_model['_key']


            # is it an error to get std dev of 0 after predicting?
            print "Doing predict with same dataset, and the bayes model"
            h2o.nodes[0].generate_predictions(model_key=model_key, data_key='covtype.hex', prediction='Predict.hex')

            # just get a predict and AUC on the same data. has to be binomial result
            resultAUC = h2o.nodes[0].generate_auc(thresholds=None, actual='covtype.hex', predict='Predict.hex',
                vactual=response, vpredict=1)
            print "AUC result:", h2o.dump_json(resultAUC)


            print "Trial #", trial, "completed\n"
Ejemplo n.º 7
0
    def test_speedrf_params_rand2_fvec(self):
        h2o.beta_features = True
        csvPathname = 'standard/covtype.data'
        hex_key = 'covtype.data.hex'
        for trial in range(10):
            # params is mutable. This is default.
            # response is required for SpeeERF
            params = {
                'response': 'C55',
                'ntrees': 1,
                'mtries': 7,
                'balance_classes': 0,
                # never run with unconstrained balance_classes size if random sets balance_classes..too slow
                'max_after_balance_size': 2,
                'importance': 0
            }
            colX = h2o_util.pickRandParams(paramDict, params)
            if 'cols' in params and params['cols']:
                # exclusion
                if 'ignored_cols_by_name' in params:
                    params['ignored_cols_by_name'] = None
            else:
                if 'ignored_cols_by_name' in params and params[
                        'ignored_cols_by_name']:
                    params['mtries'] = random.randint(1, 53)
                else:
                    params['mtries'] = random.randint(1, 54)

            kwargs = params.copy()
            # adjust timeoutSecs with the number of trees
            timeoutSecs = 80 + (
                (kwargs['ntrees'] * 80) * max(1, kwargs['mtries'] / 60))
            start = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key)
            h2o_cmd.runSpeeDRF(parseResult=parseResult,
                               timeoutSecs=timeoutSecs,
                               retryDelaySecs=1,
                               **kwargs)
            elapsed = time.time() - start
            print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % (
                (elapsed * 100) / timeoutSecs)
    def test_create_rebalance_2enum(self):
        # default
        params = {'rows': 100, 'cols': 1}
        for trial in range(20):
            # CREATE FRAME params################################################################
            h2o_util.pickRandParams(paramDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)
            # h2o does some strict checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None

            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0

            # CREATE FRAME*****************************************************
            kwargs = params.copy()
            print kwargs
            timeoutSecs = 300
            hex_key = 'temp1000.hex'
            cfResult = h2o.nodes[0].create_frame(key=hex_key,
                                                 timeoutSecs=timeoutSecs,
                                                 **kwargs)

            # REBALANCE*****************************************************
            print "Rebalancing it to create an artificially large # of chunks"
            rb_key = "rb_%s" % (hex_key)
            start = time.time()
            print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key,
                                                           REBALANCE_CHUNKS)
            SEEDPERFILE = random.randint(0, sys.maxint)
            rebalanceResult = h2o.nodes[0].rebalance(source=hex_key,
                                                     after=rb_key,
                                                     chunks=REBALANCE_CHUNKS)
            elapsed = time.time() - start
            print "rebalance end on ", hex_key, 'to', rb_key, 'took', elapsed, 'seconds',\

            # TO ENUM*****************************************************
            print "Now doing to_enum across all columns of %s" % rb_key
            for column_index in range(params['cols']):
                # is the column index 1-base in to_enum
                result = h2o.nodes[0].to_enum(None,
                                              src_key=rb_key,
                                              column_index=column_index + 1)
                # print "\nto_enum result:", h2o.dump_json(result)
                summaryResult = h2o_cmd.runSummary(key=hex_key)
                # check that it at least is an enum column now, with no na's
                # just look at the column we touched
                column = summaryResult['summaries'][column_index]
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']
                stats = column['stats']
                stattype = stats['type']

                # we have some # of na's in the columns...but there should not be 100% NA
                if nacnt >= params['rows']:
                    raise Exception(
                        "column %s, which has name '%s', somehow too many NAs after convert to Enum  %s %s"
                        % (column_index, colname, nacnt, params['rows']))

                print "I suspect that columns that are constant, maybe with NAs also, don't convert to Enum"
                if stattype != 'Enum':
                    raise Exception(
                        "column %s, which has name '%s', didn't convert to Enum, is %s %s %s"
                        % (column_index, colname, stattype, coltype,
                           h2o.dump_json(column)))

                cardinality = stats['cardinality']
                # don't know the cardinality expected
                # if cardinality!=4:
                #     raise Exception("column %s, which has name '%s',  should have cardinality 4, got: %s" %
                #         (column_index, colname, cardinality))

                h2o_cmd.infoFromSummary(summaryResult)

            print "Trial #", trial, "completed"
Ejemplo n.º 9
0
    def test_KMeans_create_frame_fvec(self):
        for trial in range(20):

            cfParamDict = define_create_frame_params(SEED)
            # default
            params = {
                'rows': 5,
                'cols': 10
            }
            h2o_util.pickRandParams(cfParamDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)

            # h2o does some strict checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None

            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0


            kwargs = params.copy()
            timeoutSecs = 300
            hex_key = 'temp_%s.hex' % trial
            cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
            inspect = h2o_cmd.runInspect(None, hex_key)
            print "\n%s" % hex_key, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])

            kmeansParamDict = define_KMeans_params(SEED)

            # default
            params = {
                'max_iter': 20, 
                'k': 1, 
                'destination_key': "KM_" + str(trial) + '.hex'
            }
            h2o_kmeans.pickRandKMeansParams(kmeansParamDict, params)
            kwargs = params.copy()

            start = time.time()
            parseResult = {'destination_key': hex_key }
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \
                timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs)
            elapsed = time.time() - start
            print "kmeans trial %s end on ", trial, 'took', elapsed, 'seconds.', \
                "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
            h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs)

            ### print h2o.dump_json(kmeans)

            print "Trial #", trial, "completed\n"
Ejemplo n.º 10
0
    def test_create_rebalance_2enum(self):
        h2o.beta_features = True
        # default
        params = {
            'rows': 100, 
            'cols': 1
        }
        for trial in range(20):
            # CREATE FRAME params################################################################
            h2o_util.pickRandParams(paramDict, params)
            i = params.get('integer_fraction', None)
            c = params.get('categorical_fraction', None)
            r = params.get('randomize', None)
            v = params.get('value', None)
            # h2o does some strict checking on the combinations of these things
            # fractions have to add up to <= 1 and only be used if randomize
            # h2o default randomize=1?
            if r:
                if not i:   
                    i = 0
                if not c:
                    c = 0
                if (i and c) and (i + c) >= 1.0:
                    c = 1.0 - i
                params['integer_fraction'] = i
                params['categorical_fraction'] = c
                params['value'] = None
                
            else:
                params['randomize'] = 0
                params['integer_fraction'] = 0
                params['categorical_fraction'] = 0


            # CREATE FRAME*****************************************************
            kwargs = params.copy()
            print kwargs
            timeoutSecs = 300
            hex_key='temp1000.hex'
            cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs)

            # REBALANCE*****************************************************
            print "Rebalancing it to create an artificially large # of chunks"
            rb_key = "rb_%s" % (hex_key)
            start = time.time()
            print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS)
            SEEDPERFILE = random.randint(0, sys.maxint)
            rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS)
            elapsed = time.time() - start
            print "rebalance end on ", hex_key, 'to', rb_key, 'took', elapsed, 'seconds',\

            # TO ENUM*****************************************************
            print "Now doing to_enum across all columns of %s" % rb_key
            for column_index in range(params['cols']):
                # is the column index 1-base in to_enum
                result = h2o.nodes[0].to_enum(None, src_key=rb_key, column_index=column_index+1)
                # print "\nto_enum result:", h2o.dump_json(result)
                summaryResult = h2o_cmd.runSummary(key=hex_key)
                # check that it at least is an enum column now, with no na's
                # just look at the column we touched
                column = summaryResult['summaries'][column_index]
                colname = column['colname']
                coltype = column['type']
                nacnt = column['nacnt']
                stats = column['stats']
                stattype = stats['type']

                # we have some # of na's in the columns...but there should not be 100% NA
                if nacnt>=params['rows']:
                    raise Exception("column %s, which has name '%s', somehow too many NAs after convert to Enum  %s %s" % 
                        (column_index, colname, nacnt, params['rows']))

                print "I suspect that columns that are constant, maybe with NAs also, don't convert to Enum"
                if stattype != 'Enum':
                    raise Exception("column %s, which has name '%s', didn't convert to Enum, is %s %s %s" %  
                        (column_index, colname, stattype, coltype, h2o.dump_json(column)))

                cardinality = stats['cardinality']
                # don't know the cardinality expected
                # if cardinality!=4:
                #     raise Exception("column %s, which has name '%s',  should have cardinality 4, got: %s" % 
                #         (column_index, colname, cardinality))

                h2o_cmd.infoFromSummary(summaryResult)
    
            print "Trial #", trial, "completed"