def test_create_frame_rand1(self): h2o.beta_features = True # default params = {'rows': 1, 'cols': 1} for trial in range(20): h2o_util.pickRandParams(paramDict, params) i = params.get('integer_fraction', None) c = params.get('categorical_fraction', None) r = params.get('randomize', None) v = params.get('value', None) # h2o does some strict checking on the combinations of these things # fractions have to add up to <= 1 and only be used if randomize # h2o default randomize=1? if r: if not i: i = 0 if not c: c = 0 if (i and c) and (i + c) >= 1.0: c = 1.0 - i params['integer_fraction'] = i params['categorical_fraction'] = c params['value'] = None else: params['randomize'] = 0 params['integer_fraction'] = 0 params['categorical_fraction'] = 0 kwargs = params.copy() print kwargs timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', schema='put', timeoutSecs=timeoutSecs) cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs) if DO_DOWNLOAD: csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv' h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60) if DO_INSPECT: h2o_cmd.runInspect(key='temp1000.hex') h2o_cmd.runSummary(key='temp1000.hex', timeoutSecs=300) print h2o.dump_json(cfResult) print "Trial #", trial, "completed"
def test_create_frame_rand1(self): h2o.beta_features = True # default params = { 'rows': 1, 'cols': 1 } for trial in range(20): h2o_util.pickRandParams(paramDict, params) i = params.get('integer_fraction', None) c = params.get('categorical_fraction', None) r = params.get('randomize', None) v = params.get('value', None) # h2o does some strick checking on the combinations of these things # fractions have to add up to <= 1 and only be used if randomize # h2o default randomize=1? if r: if not i: i = 0 if not c: c = 0 if (i and c) and (i + c) >= 1.0: c = 1.0 - i params['integer_fraction'] = i params['categorical_fraction'] = c params['value'] = None else: params['randomize'] = 0 params['integer_fraction'] = 0 params['categorical_fraction'] = 0 kwargs = params.copy() print kwargs timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', schema='put', timeoutSecs=timeoutSecs) cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs) if DO_DOWNLOAD: csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv' h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60) if DO_INSPECT: h2o_cmd.runInspect(key='temp1000.hex') h2o_cmd.runSummary(key='temp1000.hex') print h2o.dump_json(cfResult) print "Trial #", trial, "completed"
def test_speedrf_params_rand2_fvec(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' hex_key = 'covtype.data.hex' for trial in range(10): # params is mutable. This is default. # response is required for SpeeERF params = { 'response': 'C55', 'ntrees': 1, 'mtries': 7, 'balance_classes': 0, # never run with unconstrained balance_classes size if random sets balance_classes..too slow 'max_after_balance_size': 2, 'importance': 0} colX = h2o_util.pickRandParams(paramDict, params) if 'cols' in params and params['cols']: # exclusion if 'ignored_cols_by_name' in params: params['ignored_cols_by_name'] = None else: if 'ignored_cols_by_name' in params and params['ignored_cols_by_name']: params['mtries'] = random.randint(1,53) else: params['mtries'] = random.randint(1,54) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 80 + ((kwargs['ntrees']*80) * max(1,kwargs['mtries']/60) ) start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key) h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) elapsed = time.time()-start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
def test_create_frame_rand1(self): h2o.beta_features = True # default params = { 'rows': 1, 'cols': 1 } for trial in range(10): h2o_util.pickRandParams(paramDict, params) i = params.get('integer_fraction', 0) c = params.get('categorical_fraction', 0) r = params.get('randomize', 0) v = params.get('value', None) if r: if v is not None: # if these are None, they are treated as >0 (default > 0?) params['integer_fraction'] = 0 params['categorical_fraction'] = 0 elif (i and c) and (i + c) >= 1.0: params['integer_fraction'] = i params['categorical_fraction'] = 1.0 - i else: params['integer_fraction'] = 0 params['categorical_fraction'] = 0 params['value'] = None kwargs = params.copy() print kwargs timeoutSecs = 300 parseResult = h2i.import_parse(bucket='smalldata', path='poker/poker1000', hex_key='temp1000.hex', schema='put', timeoutSecs=timeoutSecs) cfResult = h2o.nodes[0].create_frame(key='temp1000.hex', timeoutSecs=timeoutSecs, **kwargs) if DO_DOWNLOAD: csvPathname = SYNDATASETS_DIR + '/' + 'temp1000.csv' h2o.nodes[0].csv_download(src_key='temp1000.hex', csvPathname=csvPathname, timeoutSecs=60) if DO_INSPECT: h2o_cmd.runInspect(key='temp1000.hex') h2o_cmd.runSummary(key='temp1000.hex') print h2o.dump_json(cfResult) print "Trial #", trial, "completed"
def test_bayes_and2(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(1): response = 'C55' params = { 'response': response, } colX = h2o_util.pickRandParams(paramDict, params) kwargs = params.copy() timeoutSecs = 120 # chagne response to factor execExpr = 'covtype.hex[,54+1] = factor(covtype.hex[,54+1] != 5)' # turn 7-class problem into binomial such that AUC can work below.. resultExec, ncols = h2e.exec_expr(execExpr=execExpr) start = time.time() bayesResult = h2o.nodes[0].naive_bayes(timeoutSecs=timeoutSecs, source='covtype.hex', **kwargs) print "bayes end on ", csvPathname, 'took', time.time( ) - start, 'seconds' print "bayes result:", h2o.dump_json(bayesResult) nb_model = bayesResult['nb_model'] ncats = nb_model['ncats'] nnums = nb_model['nnums'] pcond = nb_model['pcond'] pprior = nb_model['pprior'] rescnt = nb_model['rescnt'] modelClassDist = nb_model['_modelClassDist'] names = nb_model['_names'] domains = nb_model['_domains'] priorClassDist = nb_model['_priorClassDist'] model_key = nb_model['_key'] # is it an error to get std dev of 0 after predicting? print "Doing predict with same dataset, and the bayes model" h2o.nodes[0].generate_predictions(model_key=model_key, data_key='covtype.hex', prediction='Predict.hex') # just get a predict and AUC on the same data. has to be binomial result resultAUC = h2o.nodes[0].generate_auc(thresholds=None, actual='covtype.hex', predict='Predict.hex', vactual=response, vpredict=1) print "AUC result:", h2o.dump_json(resultAUC) print "Trial #", trial, "completed\n"
def test_bayes_and2(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put') paramDict = define_params() for trial in range(1): response = 'C55' params = { 'response': response, } colX = h2o_util.pickRandParams(paramDict, params) kwargs = params.copy() timeoutSecs = 120 # chagne response to factor execExpr = 'covtype.hex[,54+1] = factor(covtype.hex[,54+1] != 5)' # turn 7-class problem into binomial such that AUC can work below.. resultExec, ncols = h2e.exec_expr(execExpr=execExpr) start = time.time() bayesResult = h2o.nodes[0].naive_bayes(timeoutSecs=timeoutSecs, source='covtype.hex', **kwargs) print "bayes end on ", csvPathname, 'took', time.time() - start, 'seconds' print "bayes result:", h2o.dump_json(bayesResult) nb_model = bayesResult['nb_model'] ncats = nb_model['ncats'] nnums = nb_model['nnums'] pcond = nb_model['pcond'] pprior = nb_model['pprior'] rescnt = nb_model['rescnt'] modelClassDist = nb_model['_modelClassDist'] names = nb_model['_names'] domains = nb_model['_domains'] priorClassDist = nb_model['_priorClassDist'] model_key = nb_model['_key'] # is it an error to get std dev of 0 after predicting? print "Doing predict with same dataset, and the bayes model" h2o.nodes[0].generate_predictions(model_key=model_key, data_key='covtype.hex', prediction='Predict.hex') # just get a predict and AUC on the same data. has to be binomial result resultAUC = h2o.nodes[0].generate_auc(thresholds=None, actual='covtype.hex', predict='Predict.hex', vactual=response, vpredict=1) print "AUC result:", h2o.dump_json(resultAUC) print "Trial #", trial, "completed\n"
def test_speedrf_params_rand2_fvec(self): h2o.beta_features = True csvPathname = 'standard/covtype.data' hex_key = 'covtype.data.hex' for trial in range(10): # params is mutable. This is default. # response is required for SpeeERF params = { 'response': 'C55', 'ntrees': 1, 'mtries': 7, 'balance_classes': 0, # never run with unconstrained balance_classes size if random sets balance_classes..too slow 'max_after_balance_size': 2, 'importance': 0 } colX = h2o_util.pickRandParams(paramDict, params) if 'cols' in params and params['cols']: # exclusion if 'ignored_cols_by_name' in params: params['ignored_cols_by_name'] = None else: if 'ignored_cols_by_name' in params and params[ 'ignored_cols_by_name']: params['mtries'] = random.randint(1, 53) else: params['mtries'] = random.randint(1, 54) kwargs = params.copy() # adjust timeoutSecs with the number of trees timeoutSecs = 80 + ( (kwargs['ntrees'] * 80) * max(1, kwargs['mtries'] / 60)) start = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='put', hex_key=hex_key) h2o_cmd.runSpeeDRF(parseResult=parseResult, timeoutSecs=timeoutSecs, retryDelaySecs=1, **kwargs) elapsed = time.time() - start print "Trial #", trial, "completed in", elapsed, "seconds.", "%d pct. of timeout" % ( (elapsed * 100) / timeoutSecs)
def test_create_rebalance_2enum(self): # default params = {'rows': 100, 'cols': 1} for trial in range(20): # CREATE FRAME params################################################################ h2o_util.pickRandParams(paramDict, params) i = params.get('integer_fraction', None) c = params.get('categorical_fraction', None) r = params.get('randomize', None) v = params.get('value', None) # h2o does some strict checking on the combinations of these things # fractions have to add up to <= 1 and only be used if randomize # h2o default randomize=1? if r: if not i: i = 0 if not c: c = 0 if (i and c) and (i + c) >= 1.0: c = 1.0 - i params['integer_fraction'] = i params['categorical_fraction'] = c params['value'] = None else: params['randomize'] = 0 params['integer_fraction'] = 0 params['categorical_fraction'] = 0 # CREATE FRAME***************************************************** kwargs = params.copy() print kwargs timeoutSecs = 300 hex_key = 'temp1000.hex' cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs) # REBALANCE***************************************************** print "Rebalancing it to create an artificially large # of chunks" rb_key = "rb_%s" % (hex_key) start = time.time() print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS) SEEDPERFILE = random.randint(0, sys.maxint) rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS) elapsed = time.time() - start print "rebalance end on ", hex_key, 'to', rb_key, 'took', elapsed, 'seconds',\ # TO ENUM***************************************************** print "Now doing to_enum across all columns of %s" % rb_key for column_index in range(params['cols']): # is the column index 1-base in to_enum result = h2o.nodes[0].to_enum(None, src_key=rb_key, column_index=column_index + 1) # print "\nto_enum result:", h2o.dump_json(result) summaryResult = h2o_cmd.runSummary(key=hex_key) # check that it at least is an enum column now, with no na's # just look at the column we touched column = summaryResult['summaries'][column_index] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # we have some # of na's in the columns...but there should not be 100% NA if nacnt >= params['rows']: raise Exception( "column %s, which has name '%s', somehow too many NAs after convert to Enum %s %s" % (column_index, colname, nacnt, params['rows'])) print "I suspect that columns that are constant, maybe with NAs also, don't convert to Enum" if stattype != 'Enum': raise Exception( "column %s, which has name '%s', didn't convert to Enum, is %s %s %s" % (column_index, colname, stattype, coltype, h2o.dump_json(column))) cardinality = stats['cardinality'] # don't know the cardinality expected # if cardinality!=4: # raise Exception("column %s, which has name '%s', should have cardinality 4, got: %s" % # (column_index, colname, cardinality)) h2o_cmd.infoFromSummary(summaryResult) print "Trial #", trial, "completed"
def test_KMeans_create_frame_fvec(self): for trial in range(20): cfParamDict = define_create_frame_params(SEED) # default params = { 'rows': 5, 'cols': 10 } h2o_util.pickRandParams(cfParamDict, params) i = params.get('integer_fraction', None) c = params.get('categorical_fraction', None) r = params.get('randomize', None) v = params.get('value', None) # h2o does some strict checking on the combinations of these things # fractions have to add up to <= 1 and only be used if randomize # h2o default randomize=1? if r: if not i: i = 0 if not c: c = 0 if (i and c) and (i + c) >= 1.0: c = 1.0 - i params['integer_fraction'] = i params['categorical_fraction'] = c params['value'] = None else: params['randomize'] = 0 params['integer_fraction'] = 0 params['categorical_fraction'] = 0 kwargs = params.copy() timeoutSecs = 300 hex_key = 'temp_%s.hex' % trial cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n%s" % hex_key, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) kmeansParamDict = define_KMeans_params(SEED) # default params = { 'max_iter': 20, 'k': 1, 'destination_key': "KM_" + str(trial) + '.hex' } h2o_kmeans.pickRandKMeansParams(kmeansParamDict, params) kwargs = params.copy() start = time.time() parseResult = {'destination_key': hex_key } kmeans = h2o_cmd.runKMeans(parseResult=parseResult, \ timeoutSecs=timeoutSecs, retryDelaySecs=2, pollTimeoutSecs=60, **kwargs) elapsed = time.time() - start print "kmeans trial %s end on ", trial, 'took', elapsed, 'seconds.', \ "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) h2o_kmeans.simpleCheckKMeans(self, kmeans, **kwargs) ### print h2o.dump_json(kmeans) print "Trial #", trial, "completed\n"
def test_create_rebalance_2enum(self): h2o.beta_features = True # default params = { 'rows': 100, 'cols': 1 } for trial in range(20): # CREATE FRAME params################################################################ h2o_util.pickRandParams(paramDict, params) i = params.get('integer_fraction', None) c = params.get('categorical_fraction', None) r = params.get('randomize', None) v = params.get('value', None) # h2o does some strict checking on the combinations of these things # fractions have to add up to <= 1 and only be used if randomize # h2o default randomize=1? if r: if not i: i = 0 if not c: c = 0 if (i and c) and (i + c) >= 1.0: c = 1.0 - i params['integer_fraction'] = i params['categorical_fraction'] = c params['value'] = None else: params['randomize'] = 0 params['integer_fraction'] = 0 params['categorical_fraction'] = 0 # CREATE FRAME***************************************************** kwargs = params.copy() print kwargs timeoutSecs = 300 hex_key='temp1000.hex' cfResult = h2o.nodes[0].create_frame(key=hex_key, timeoutSecs=timeoutSecs, **kwargs) # REBALANCE***************************************************** print "Rebalancing it to create an artificially large # of chunks" rb_key = "rb_%s" % (hex_key) start = time.time() print "Rebalancing %s to %s with %s chunks" % (hex_key, rb_key, REBALANCE_CHUNKS) SEEDPERFILE = random.randint(0, sys.maxint) rebalanceResult = h2o.nodes[0].rebalance(source=hex_key, after=rb_key, chunks=REBALANCE_CHUNKS) elapsed = time.time() - start print "rebalance end on ", hex_key, 'to', rb_key, 'took', elapsed, 'seconds',\ # TO ENUM***************************************************** print "Now doing to_enum across all columns of %s" % rb_key for column_index in range(params['cols']): # is the column index 1-base in to_enum result = h2o.nodes[0].to_enum(None, src_key=rb_key, column_index=column_index+1) # print "\nto_enum result:", h2o.dump_json(result) summaryResult = h2o_cmd.runSummary(key=hex_key) # check that it at least is an enum column now, with no na's # just look at the column we touched column = summaryResult['summaries'][column_index] colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] # we have some # of na's in the columns...but there should not be 100% NA if nacnt>=params['rows']: raise Exception("column %s, which has name '%s', somehow too many NAs after convert to Enum %s %s" % (column_index, colname, nacnt, params['rows'])) print "I suspect that columns that are constant, maybe with NAs also, don't convert to Enum" if stattype != 'Enum': raise Exception("column %s, which has name '%s', didn't convert to Enum, is %s %s %s" % (column_index, colname, stattype, coltype, h2o.dump_json(column))) cardinality = stats['cardinality'] # don't know the cardinality expected # if cardinality!=4: # raise Exception("column %s, which has name '%s', should have cardinality 4, got: %s" % # (column_index, colname, cardinality)) h2o_cmd.infoFromSummary(summaryResult) print "Trial #", trial, "completed"